In [None]:
import os

import impyute
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [None]:
def loadfile(filename):
    """Read a CSV file into a DataFrame, marking "#DIV/0!" cells as missing.

    The first row of the file is used as the header. Every cell whose value is
    exactly the string "#DIV/0!" is replaced with NaN.

    Parameters
    ----------
    filename : str
        Path of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded table with "#DIV/0!" entries turned into NaN.
    """
    frame = pd.read_csv(filename, header=0)
    return frame.replace(to_replace="#DIV/0!", value=np.nan)

def getMissingFeatures(file, filename):
    """Plot and print a report of the percentage of missing values per feature.

    Features are ranked by percent missing (highest first). A bar chart of the
    top-ranked features is shown, then the number of features with a nonzero
    missing percentage and their percentages are printed.

    Parameters
    ----------
    file : pandas.DataFrame
        Table whose columns are inspected for missing values.
    filename : str
        Name of the file the table came from. When it contains 'stats' the
        chart shows the first 35 ranked features, otherwise the first 5.

    Returns
    -------
    None
    """
    # Number of bars to draw depends on which dataset is being inspected.
    end = 35 if 'stats' in filename else 5

    report = pd.DataFrame({
        'column_name': file.columns,
        'percent_missing': file.isnull().sum() * 100 / len(file),
    }).sort_values(by=['percent_missing'], ascending=False)

    shown = slice(0, end)
    plt.bar(report['column_name'][shown], report['percent_missing'][shown])
    plt.xticks(rotation=90)
    plt.xlabel('Features')
    plt.ylabel('Percent Missing')
    plt.show()

    # Any nonzero percentage is truthy, so the boolean sum counts the features
    # that have at least one missing value.
    numMissing = report['percent_missing'].astype(bool).sum()
    print("Number of features with missing values: ", numMissing)
    # The report is sorted in descending order, so the affected features are
    # the first numMissing rows.
    print("Percent missing: ", report['percent_missing'][:numMissing])
    return None

def imputeAndNormalize(file, filename):
    """Impute and scale the feature columns of ``file`` and save them as CSV.

    Columns that are entirely missing are dropped. The remaining feature
    columns are cast to float, imputed with ``impyute.em`` and scaled with
    ``sklearn.preprocessing.scale``. Identifier columns are carried through
    untouched, the class labels "A".."E" are encoded as 0..4 (any other label
    is kept as-is), and the result is written to ``normalized_<name>`` in the
    same directory as ``filename``.

    Parameters
    ----------
    file : pandas.DataFrame
        Table to process. Must contain ``user_name`` and ``classe``; every
        other retained column must be convertible to float.
    filename : str
        Name or path of the CSV that ``file`` came from. If its base name
        contains 'stats', ``num_window`` is treated as an ordinary feature;
        otherwise ``num_window`` (when present) is carried through unscaled.

    Returns
    -------
    None
        The output is the CSV file written to disk.
    """
    # Decide the dataset flavour once, from the base name only, so the column
    # selection and the re-insertion below can never disagree.
    folder, base_name = os.path.split(filename)
    is_stats = 'stats' in base_name

    # drop the columns with 100% missing
    kept = file.loc[:, file.isnull().mean() < 1]

    id_columns = ['user_name', 'classe']
    if not is_stats and 'num_window' in kept.columns:
        # There is an extra column to carry through for the non-stats data
        id_columns.insert(1, 'num_window')

    numericVals = kept.loc[:, ~kept.columns.isin(id_columns)]
    imputed = impyute.em(numericVals.astype(float).to_numpy())
    scaled = preprocessing.scale(imputed)

    # `out` gets a fresh RangeIndex, so identifier columns are attached by
    # position (plain arrays/lists) rather than by index label; inserting the
    # Series directly would misalign rows when `file` has a non-default index.
    out = pd.DataFrame(scaled, columns=numericVals.columns)
    out.insert(loc=0, column='user_name', value=kept['user_name'].to_numpy())
    if 'num_window' in id_columns:
        out.insert(loc=1, column='num_window', value=kept['num_window'].to_numpy())

    class_codes = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4}
    out['classe'] = [class_codes.get(label, label) for label in kept['classe']]

    # Prefix only the file name, not the whole path, so inputs that live in a
    # sub-directory are written next to their source file.
    saveFileName = os.path.join(folder, 'normalized_' + base_name)
    out.to_csv(saveFileName, index=False, header=True)
    return None

In [None]:
# Input CSV files processed by this notebook.
statsfile, rawfile = "stats.csv", "raw.csv"

# Load both tables first so the reports and the normalization below work from
# the same in-memory frames.
stats, raw = loadfile(statsfile), loadfile(rawfile)

# Report missing values for every dataset before transforming any of them.
for _frame, _name in ((stats, statsfile), (raw, rawfile)):
    getMissingFeatures(_frame, _name)

# Impute, scale and save each dataset.
for _frame, _name in ((stats, statsfile), (raw, rawfile)):
    imputeAndNormalize(_frame, _name)