In [102]:
import pandas as pd
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
import seaborn as sns
from scipy.stats import spearmanr

init_notebook_mode(connected=True)

In [111]:
df = pd.read_csv('final_train.csv', index_col=0)

In [112]:
df

Unnamed: 0,Activity,"angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyAccMean,gravity)","angle(tBodyGyroJerkMean,gravityMean)","angle(tBodyGyroMean,gravityMean)",energy-mean(),"fBodyAcc-bandsEnergy()-1,16",...,tGravityAccMag-energy(),tGravityAccMag-entropy(),tGravityAccMag-iqr(),tGravityAccMag-mad(),tGravityAccMag-max(),tGravityAccMag-mean(),tGravityAccMag-min(),tGravityAccMag-sma(),tGravityAccMag-std(),void()
0,STANDING,-1.574191,-0.805700,-2.453831,-1.861714,0.115517,-2.023369,3.125860,-9.474436,3.622600,...,5.049130,3.272281,5.103018,-0.823302,3.063681,-0.928676,-1.007695,3.094236,1.669987,1.0
1,WALKING_UPSTAIRS,-1.568669,-0.612620,-2.488338,-1.751117,0.218134,0.280294,7.953403,-0.679894,2.512880,...,4.565443,4.440779,3.288399,-2.689605,1.705341,-0.822617,-0.572410,-2.187890,-0.544652,2.0
2,LAYING,-1.548333,-0.139628,-2.380889,-1.915239,0.067613,-0.887342,6.007361,-11.906904,3.621984,...,5.059609,2.296603,5.344174,-0.549843,3.272225,-0.940610,-1.031522,-2.071755,2.024442,3.0
3,WALKING,-1.574094,-0.793443,-2.449959,-1.842741,-0.195625,-0.950671,2.104353,-11.906121,3.355701,...,4.860758,4.177535,3.950012,-0.322642,2.366287,-0.866937,-0.696431,1.863375,0.550414,2.0
4,LAYING,-1.548333,-0.139628,-2.380889,,0.067613,-0.887342,6.007361,-11.970485,3.621984,...,5.059609,2.296603,5.344174,-0.549843,3.272225,-0.940610,-1.031522,-2.102513,2.024442,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11081,WALKING_UPSTAIRS,-1.568669,-0.612620,-2.488338,-1.751117,0.218134,0.280294,7.953403,-0.679894,2.512880,...,4.565443,4.440779,3.288399,-2.689605,1.705341,-0.822617,-0.572410,3.534816,-0.544652,2.0
11082,LAYING,-1.550288,-0.231565,-2.376561,-1.868524,0.007653,-0.077020,5.494466,-5.860692,3.603169,...,5.048737,2.923856,5.182026,-0.998495,2.853755,-0.933561,-1.000173,-1.984575,1.372763,1.0
11083,WALKING_UPSTAIRS,-1.569815,-0.794940,-2.482325,-1.784151,0.021844,-0.740329,8.005261,-11.916107,2.644861,...,4.453514,4.302852,2.979961,-6.032382,1.853029,-0.802817,0.041975,3.427548,-0.075184,1.0
11084,WALKING_DOWNSTAIRS,-1.574768,-0.752847,-2.449529,-1.737755,-0.189761,-0.454246,8.054443,-5.611277,2.473134,...,4.615939,4.443277,3.208597,-6.200222,1.675269,-0.828934,-0.877004,3.385405,-0.383217,3.0


In [103]:
sns.set()
plt.figure(figsize=(16, 12))
px.histogram(df['Activity'])

<Figure size 1152x864 with 0 Axes>

In [92]:
from sklearn.metrics import mutual_info_score


def rank_features(df, features, labels, fun, index=None):
    '''
    fun - function used to calculate the informativity of features
    index - optional, if the function returns a result in a tuple/list, then index specifies at which 
            index the considreredmetric is found 
    '''
    #df_results = pd.DataFrame(columns = ['Feature, Informativity'])
    results = []
    for feature in features:
        result = fun(df[feature], labels)
        if index is not None:
            result = abs(result[index])
        results.append({'Feature': feature, 'Informativity': result})
        
    return pd.DataFrame(results).sort_values(by='Informativity').reset_index(drop=True)


features = df.columns[2:]
labels = df['Activity']


spearman = rank_features(df, features, labels, spearmanr, index=0)
spearman   

Unnamed: 0,Feature,Informativity
0,fBodyAcc-maxInds-Y,0.002213
1,tGravityAcc-entropy()-X,0.008071
2,"tGravityAcc-arCoeff()-X,3",0.016890
3,tBodyAccJerk-mean()-Z,0.019036
4,tBodyGyro-low()-Y,0.027225
...,...,...
566,fBodyGyro-std()-Z,0.810153
567,fBodyAcc-mad()-Z,0.810177
568,"fBodyAcc-bandsEnergy()-1,24.2",0.816108
569,fBodyAcc-std()-Z,0.822293


In [90]:

numeric_labels = labels.map(activities_codes)
mutual_info = rank_features(df, features, labels, mutual_info_score)
mutual_info

Unnamed: 0,Feature,Informativity
0,void(),0.116288
1,subject,0.342704
2,fBodyGyro-maxInds-X,0.418439
3,fBodyAccJerk-maxInds-Z,0.433212
4,fBodyAccJerk-maxInds-Y,0.449820
...,...,...
566,"tBodyAcc-correlation()-X,Y",1.781410
567,"tBodyAccJerk-arCoeff()-Z,4",1.781410
568,"tBodyGyroJerk-arCoeff()-Y,4",1.781410
569,fBodyGyro-skewness()-Y,1.781410


In [89]:
spearman.loc[spearman['Feature'] == 'void()']

Unnamed: 0,Feature,Informativity
75,void(),0.207862


In [93]:
activities = list(df['Activity'].unique())
activities_codes = {label:i for i, label in enumerate(activities)}
#activities_codes = {v:k for k,v in activities_codes.items()}
activities_codes


{'STANDING': 0,
 'WALKING_UPSTAIRS': 1,
 'LAYING': 2,
 'WALKING': 3,
 'SITTING': 4,
 'WALKING_DOWNSTAIRS': 5}

In [110]:
mutual_info.head(50)

Unnamed: 0,Feature,Informativity
0,void(),0.116288
1,subject,0.342704
2,fBodyGyro-maxInds-X,0.418439
3,fBodyAccJerk-maxInds-Z,0.433212
4,fBodyAccJerk-maxInds-Y,0.44982
5,fBodyBodyGyroMag-maxInds,0.481072
6,fBodyGyro-maxInds-Y,0.497251
7,fBodyBodyGyroJerkMag-maxInds,0.520809
8,fBodyBodyAccJerkMag-maxInds,0.530138
9,fBodyGyro-maxInds-Z,0.577295


In [105]:
px.line(spearman, x='Feature', y='Informativity')

In [107]:
px.histogram(spearman, x='Feature', y='Informativity')

In [109]:
px.line(mutual_info, x='Feature', y='Informativity')

In [122]:
from sklearn import preprocessing



X = df.drop('Activity', axis='columns')
X = X.fillna(X.mean())

def normalize(df):
    cols = df.columns
    x = df.values #returns a numpy array
    min_max_scaler = preprocessing.StandardScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    return pd.DataFrame(x_scaled, columns=cols)

X = normalize(X)

In [123]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

y = labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

model_lr = LogisticRegression(random_state=69).fit(X_train, y_train)







In [124]:
model_lr.score(X_test, y_test)

0.9824165915238954