# Problem Statement

In [3]:
import pandas as pd
import numpy
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
def process(path_to_folder):
    train = []
    for root, dirs, files in os.walk(path_to_folder):
        for file in files:
            if file.endswith('.txt'):
                with open(os.path.join(root, file), 'r') as f:
                    text = f.read()
                    temp = text.split(';\n')
                    final = []
                    for i in range (len(temp)):
                        a = temp[i].split(',')
                        final.append(a)
                
                    train = train[:] + final
                    
    return train

In [5]:
trainphoneaccel = process('/content/drive/MyDrive/Raw/raw/train/phone/accel')
trainphonegyro = process('/content/drive/MyDrive/Raw/raw/train/phone/gyro')
trainwatchaccel = process('/content/drive/MyDrive/Raw/raw/train/watch/accel')
trainwatchgyro = process('/content/drive/MyDrive/Raw/raw/train/watch/gyro')


In [6]:
trainwatchaccel[:5]

[['1600', 'A', '90426708196641', '7.091625', '-0.5916671', '8.195502'],
 ['1600', 'A', '90426757696641', '4.972757', '-0.15831658', '6.6967316'],
 ['1600', 'A', '90426807196641', '3.25372', '-0.19183542', '6.107758'],
 ['1600', 'A', '90426856696641', '2.801216', '-0.15592238', '5.997625'],
 ['1600', 'A', '90426906196641', '3.7708676', '-1.0513538', '7.731027']]

In [7]:
train = trainphoneaccel + trainphonegyro + trainwatchaccel + trainwatchgyro
len(train)

5575090

In [8]:
def transform(data):
    data = pd.DataFrame(data, columns = ['Subject-id', 'Activity Label', 'Timestamp', 'x', 'y', 'z'])
    return data

In [9]:
train = transform(train)

In [10]:
train.head()

Unnamed: 0,Subject-id,Activity Label,Timestamp,x,y,z
0,1600,A,252207666810782,-0.36476135,8.793503,1.0550842
1,1600,A,252207717164786,-0.8797302,9.768784,1.0169983
2,1600,A,252207767518790,2.0014954,11.10907,2.619156
3,1600,A,252207817872794,0.45062256,12.651642,0.18455505
4,1600,A,252207868226798,-2.1643524,13.928436,-4.4224854


In [11]:
label = train['Activity Label'].unique()

In [12]:
l={}
n=0
for i in label:
    l[i] = n+1
    n+=1

train['Activity Label'] = train['Activity Label'].apply(lambda x: l[x])

In [13]:
testphoneaccel = process('/content/drive/MyDrive/Raw/raw/test/phone/accel')
testphonegyro = process('/content/drive/MyDrive/Raw/raw/test/phone/gyro')
testwatchaccel = process('/content/drive/MyDrive/Raw/raw/test/watch/accel')
testwatchgyro = process('/content/drive/MyDrive/Raw/raw/test/watch/gyro')

In [14]:
test = testphoneaccel + testphonegyro + testwatchaccel + testwatchgyro
len(test)

4266387

In [15]:
test = transform(test)

In [16]:
test = test.dropna(subset = ['Subject-id','Timestamp', 'Activity Label','x', 'y', 'z'])

In [17]:
test['Timestamp'] = pd.to_numeric(test["Timestamp"])

In [18]:
label = test['Activity Label'].unique()

In [19]:
l={}
n=0
for i in label:
    l[i] = n+1
    n+=1

test['Activity Label'] = test['Activity Label'].apply(lambda x: l[x])

In [20]:
train['Timestamp'] = pd.to_numeric(train["Timestamp"])
train['Subject-id']=pd.to_numeric(train["Subject-id"])
train['x']=pd.to_numeric(train["x"])
train['y']=pd.to_numeric(train["y"])
train['z']=pd.to_numeric(train["z"])
train['Timestamp'] = train['Timestamp'].fillna(0)
train['Subject-id'] = train['Subject-id'].fillna(0)
train['x'] = train['x'].fillna(0)
train['y'] = train['y'].fillna(0)
train['z'] = train['z'].fillna(0)

In [21]:
train.drop(columns="Subject-id",inplace=True)
test.drop(columns="Subject-id",inplace=True)

In [22]:
train = train.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)
data=pd.DataFrame()
data=pd.concat([train,test])

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
y=data["Activity Label"]
x=data.drop(columns="Activity Label")
x_train, x_test, y_train, y_test = train_test_split(x,y , train_size = 0.7, random_state =  51)

In [25]:
from sklearn.neighbors import KNeighborsClassifier

In [26]:
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [27]:
classifier.score(x_test,y_test)

0.7431154097967536

In [30]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(classifier, open(filename, 'wb'))


In [32]:
y= classifier.predict(x_test)

In [33]:
df = pd.DataFrame(y)

In [34]:
df.to_csv('answer.csv')