In [None]:
# Install fastai
!pip install fastai -q
!pip install fastai --upgrade

In [166]:
from fastai.tabular.all import *
import pandas as pd
import csv

In [167]:
# Load the csv as a dataframe
path = Path(".")
df1 = pd.read_csv(path/'merged_df1.csv')
# Remove the id column from R's dataframe
del df1['Unnamed: 0']

In [168]:
df1.head

<bound method NDFrame.head of     age    sex   city  ... chest pain diarrhoea physical discomfort
0     1  6a50b  7a6d2  ...      False     False               False
1    40  6a50b  593f0  ...      False     False               False
2    56  8ef1a  17787  ...      False     False               False
3    44  6a50b  a3f17  ...      False     False               False
4    31  8ef1a  8538a  ...      False     False               False
..   ..    ...    ...  ...        ...       ...                 ...
633  75  8a467  d7cac  ...      False     False               False
634  80  1e6ee  c4d2f  ...      False     False               False
635  55  8a467  bc45c  ...      False     False               False
636  57  d516d  73c3e  ...      False     False               False
637  51  8a467  56626  ...      False     False               False

[638 rows x 86 columns]>

In [169]:
print(df1)

    age    sex   city  ... chest pain diarrhoea physical discomfort
0     1  6a50b  7a6d2  ...      False     False               False
1    40  6a50b  593f0  ...      False     False               False
2    56  8ef1a  17787  ...      False     False               False
3    44  6a50b  a3f17  ...      False     False               False
4    31  8ef1a  8538a  ...      False     False               False
..   ..    ...    ...  ...        ...       ...                 ...
633  75  8a467  d7cac  ...      False     False               False
634  80  1e6ee  c4d2f  ...      False     False               False
635  55  8a467  bc45c  ...      False     False               False
636  57  d516d  73c3e  ...      False     False               False
637  51  8a467  56626  ...      False     False               False

[638 rows x 86 columns]


In [170]:
list(df1.columns)[5]

'V1'

In [171]:
list(df1.columns)[6]

'duration'

In [172]:
list(df1.columns)[7]

'clean_age'

In [173]:
list(df1.columns)[8]

'diarrhea'

In [174]:
# Get a list of categorical columns
df1_cat = list(df1.columns)

df1_cat = df1_cat[0:6] + df1_cat[8: ]
# Only age and duration are numerical, the others are categorical

In [175]:
# Get a list of numerical columns, not counting the thing we're predicting
df1_num = list(df1.columns)[7:8]
# The only numerical columns are the age and duration
# We don't include duration because we are predicting it
list(df1.columns)[7]
# This column should be about age

'clean_age'

In [176]:
# Use the fast.ai tabular dataloader

dls = TabularDataLoaders.from_csv(
    path/'merged_df1.csv', y_names = "duration",
    cat_names = df1_cat, cont_names = df1_num,
    procs = [Categorify, FillMissing, Normalize]
    )

learn = tabular_learner(dls, y_range = (0, 33),  metrics = accuracy)

In [177]:
# I determined the y_range by using
# ```max(merged_df1$duration)```
# in R and adding one to the maximum. I remember it being recommended
# by the fast.ai prof, I think because the neuron is asymptotic to
# the maximum value, I'm not entirely sure.

In [178]:
# Training and validation splits
splits = RandomSplitter(valid_pct=0.2)(range_of(df1))

In [179]:
to = TabularPandas(df1, procs = [Categorify, FillMissing, Normalize],
                   cat_names = df1_cat, cont_names = df1_num,
                   y_names = 'duration',
                   splits = splits)

dls = to.dataloaders(bs = 16)

In [221]:
learn = tabular_learner(dls, y_range = (0, 33), metrics = rmse)

In [223]:
learn.fit_one_cycle(12, lr_max = 0.005, wd=0.80, moms=(0.95, 0.95, 0.05))

epoch,train_loss,valid_loss,_rmse,time
0,3.245505,12.706243,3.564582,00:00
1,4.107022,17.028053,4.126507,00:00
2,6.310671,15.532588,3.94114,00:00
3,9.830233,37.481972,6.122253,00:00
4,10.616019,18.43276,4.293339,00:00
5,9.453475,13.832024,3.719142,00:00
6,8.559956,16.367435,4.045669,00:00
7,8.231226,13.58573,3.685883,00:00
8,6.325265,12.417013,3.523778,00:00
9,4.857846,11.091792,3.330434,00:00


In [224]:
learn.export()

In [225]:
df2 = pd.read_csv(path/'test2_cleaned.csv')

In [226]:
list(df2.columns)[0]

'Unnamed: 0'

In [227]:
# Remove the id column from R's dataframe
del df2['Unnamed: 0']

In [228]:
splits2 = RandomSplitter(valid_pct=0.2)(range_of(df2))

In [231]:
# test_df = df2
# test_df.drop(['duration'], axis=1, inplace=True)
dl = learn.dls.test_dl(df2)

In [232]:
p1 = learn.get_preds(dl = dl)

In [233]:
p2 = p1[0].tolist()

In [234]:
 with open('testing.csv', mode='w') as employee_file:
   employee_writer = csv.writer(employee_file, delimiter = ',',
                                quotechar = '"', quoting = csv.QUOTE_MINIMAL)
   employee_writer.writerow(p2)
print("Done")

Done
