In [68]:
import pandas as pd
import numpy as np

In [69]:
import os
# Parent of the current working directory: append the ".." entry to the cwd
# and let normpath collapse it into a clean path.
parDir = os.path.normpath(os.sep.join((os.getcwd(), os.pardir)))

In [70]:
# Dataset locations, all relative to the parent directory of the notebook.
# Built with os.path.join so the separators are right on every OS
# (the previous hard-coded '\\' only resolved correctly on Windows).
feaDir = os.path.join(parDir, 'DataSets', 'features')    # folder read for the feature CSVs
labDir = os.path.join(parDir, 'DataSets', 'labels')      # folder read for the label CSVs
merDSDit = os.path.join(parDir, 'DataSets', 'MergedDataset', 'Final.csv')  # merged output file

In [71]:
# Columns selected as model inputs and used as the header when feature files are stacked.
inputCols = [
    'bookingID', 'Accuracy', 'Bearing',
    'acceleration_x', 'acceleration_y', 'acceleration_z',
    'gyro_x', 'gyro_y', 'gyro_z',
    'second', 'Speed',
]
# Name of the column holding the value to predict.
targetCol = 'label'

In [72]:
def readFilesFromFolder(dirToRead):
    """Recursively collect the paths of all CSV files below ``dirToRead``.

    Parameters
    ----------
    dirToRead : str
        Root directory to walk. A directory that does not exist yields an
        empty list, because ``os.walk`` simply produces nothing for it.

    Returns
    -------
    list of str
        Full paths of every file whose extension is ``.csv`` (compared
        case-insensitively), sorted so the result does not depend on the
        order in which the operating system lists directory entries.
    """
    files = []
    # r = directory currently visited, f = names of the files inside it
    for r, _subdirs, f in os.walk(dirToRead):
        for name in f:
            # Check the real extension: a substring test such as
            # "'.csv' in name" would also accept e.g. "data.csv.bak".
            if os.path.splitext(name)[1].lower() == '.csv':
                files.append(os.path.join(r, name))
    files.sort()
    return files
    

In [73]:
def concatAllFilesInDir(file, inputCols):
    """Read every CSV in ``file`` and stack the rows into one DataFrame.

    Parameters
    ----------
    file : iterable of str
        Paths of the CSV files to load. Each file is read with its first
        row as the header.
    inputCols : list of str
        Kept for backward compatibility with existing callers. Column names
        are taken from the CSV headers themselves, so files whose schema
        differs from ``inputCols`` (for example label files) load correctly
        whether there is one file or many.

    Returns
    -------
    pandas.DataFrame
        Rows of all files in the given order, with a fresh 0..n-1 index when
        more than one file is combined. Column dtypes are preserved.

    Raises
    ------
    ValueError
        If ``file`` is empty, instead of failing later on an unbound variable.
    """
    paths = list(file)
    if not paths:
        raise ValueError("concatAllFilesInDir: no CSV files were provided")

    frames = [pd.read_csv(path, index_col=None, header=0) for path in paths]
    if len(frames) == 1:
        return frames[0]
    # pd.concat aligns on column names and keeps per-column dtypes; the former
    # DataFrame.as_matrix() + np.vstack route is unavailable in pandas >= 1.0
    # and collapsed every column to a single common dtype.
    return pd.concat(frames, axis=0, ignore_index=True)

In [74]:
def mergeBasedonColumn(table1,table2,colName):
    """Left-join ``table2`` onto ``table1`` using the shared column ``colName``.

    Every row of ``table1`` is kept; rows without a match in ``table2`` get
    missing values in the columns that come from ``table2``.
    """
    return pd.merge(
        left=table1,
        right=table2,
        on=colName,
        how='left',
    )

In [75]:
# Locate every feature CSV under feaDir, then load them into a single frame.
files = readFilesFromFolder(dirToRead=feaDir)
frame = concatAllFilesInDir(file=files, inputCols=inputCols)

In [76]:
# Locate and load the label CSVs.
file_lab = readFilesFromFolder(labDir)
# Pass the label schema rather than the 11 feature names: using inputCols as
# the header for stacked label files fails with a length mismatch as soon as
# there is more than one label file.
# NOTE(review): assumes label files hold exactly bookingID + label — confirm.
frame_lab = concatAllFilesInDir(file_lab, ['bookingID', targetCol])


In [77]:
# Attach the label columns to every feature row via a left join on bookingID.
expected_table = mergeBasedonColumn(
    table1=frame,
    table2=frame_lab,
    colName='bookingID',
)

# From the DataExploration class we have found that there is a class imbalance.
We will try to undersample the dataset, as this makes the model train faster and also gives us better predictions.


In [78]:
#There is a class imbalance so we are trying to undersample the data for better predictions.
def undersample(df,inputCols,targetCol,random_state=0):
    """Balance a binary dataset by randomly undersampling the larger class.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``targetCol`` column with labels 0 and 1.
        Rows whose label is missing are left out of the result.
    inputCols : list of str
        Not used by the sampling itself; kept so existing calls keep working.
    targetCol : str
        Name of the label column.
    random_state : int or None, default 0
        Seed forwarded to ``DataFrame.sample`` so repeated runs select the
        same rows. Pass ``None`` for a different random draw on every call.

    Returns
    -------
    pandas.DataFrame
        Class-0 rows followed by class-1 rows, both classes having the size
        of the smaller one. Original index values are retained.

    Raises
    ------
    ValueError
        If labels other than 0/1 are present, or if either class has no rows.
    """
    labels = df[targetCol]
    unexpected = set(labels.dropna().unique()) - {0, 1}
    if unexpected:
        raise ValueError(
            "undersample expects only the labels 0 and 1, found: %r" % (list(unexpected),)
        )

    # Divide by class
    df_class_0 = df[labels == 0]
    df_class_1 = df[labels == 1]

    # Size each class explicitly. value_counts() is ordered by frequency, not
    # by label, so unpacking it positionally silently skipped the
    # undersampling whenever class 1 was the larger class.
    n_keep = min(len(df_class_0), len(df_class_1))
    if n_keep == 0:
        raise ValueError("undersample needs at least one row of each class (0 and 1)")

    #Random Under Sampling of whichever class is larger
    if len(df_class_0) > n_keep:
        df_class_0 = df_class_0.sample(n=n_keep, random_state=random_state)
    if len(df_class_1) > n_keep:
        df_class_1 = df_class_1.sample(n=n_keep, random_state=random_state)

    return pd.concat([df_class_0, df_class_1], axis=0)
   

In [79]:
# Balance the merged table before it is saved and split.
df_undersample = undersample(
    df=expected_table,
    inputCols=inputCols,
    targetCol=targetCol,
)

In [80]:
# Persist the balanced dataset (without the index column). Create the target
# folder first so the write does not fail when the folder is missing.
mergedOutDir = os.path.dirname(merDSDit)
if mergedOutDir:
    os.makedirs(mergedOutDir, exist_ok=True)
df_undersample.to_csv(merDSDit,index=False)

In [81]:
from sklearn.model_selection import train_test_split

In [82]:
# Hold out 10% of the balanced rows for testing; the fixed random_state keeps
# the split identical between runs for the same input frame.
X_train, X_test, y_train, y_test = train_test_split(
    df_undersample[inputCols],
    df_undersample[targetCol],
    test_size=0.1,
    random_state=0,
)

In [84]:
# Export the held-out feature rows (index column included, as before).
# The path is built with os.path.join instead of hard-coded '\\' separators so
# it resolves on every OS, and the folder is created if it is missing.
testUploadDir = os.path.join(parDir, 'DataSets', 'TestFileUpload')
os.makedirs(testUploadDir, exist_ok=True)
X_test.to_csv(os.path.join(testUploadDir, 'test.csv'))