In [81]:
import pandas as pd
import numpy as np 
import matplotlib as pyplot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [14]:
data = []

with open('WISDM_ar_v1.1_raw.txt') as f:
    for line in f:
        try:
            parts = line.strip().split(',')
            if len(parts) < 6: continue
            user, activity, timestamp, x, y, z = parts[:6]
            z = z.split(';')[0]  # remove ';' if exists
            data.append([user, activity, timestamp, x, y, z])
        except:
            continue

df = pd.DataFrame(data, columns=['user', 'activity', 'timestamp', 'x', 'y', 'z'])

In [16]:
df

Unnamed: 0,user,activity,timestamp,x,y,z
0,33,Jogging,49105962326000,-0.6946377,12.680544,0.50395286
1,33,Jogging,49106062271000,5.012288,11.264028,0.95342433
2,33,Jogging,49106112167000,4.903325,10.882658,-0.08172209
3,33,Jogging,49106222305000,-0.61291564,18.496431,3.0237172
4,33,Jogging,49106332290000,-1.1849703,12.108489,7.205164
...,...,...,...,...,...,...
1098199,19,Sitting,131623331483000,9,-1.57,1.69
1098200,19,Sitting,131623371431000,9.04,-1.46,1.73
1098201,19,Sitting,131623411592000,9.08,-1.38,1.69
1098202,19,Sitting,131623491487000,9,-1.46,1.73


In [18]:
df.isnull().sum()

user         0
activity     0
timestamp    0
x            0
y            0
z            0
dtype: int64

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098204 entries, 0 to 1098203
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user       1098204 non-null  object
 1   activity   1098204 non-null  object
 2   timestamp  1098204 non-null  object
 3   x          1098204 non-null  object
 4   y          1098204 non-null  object
 5   z          1098204 non-null  object
dtypes: object(6)
memory usage: 50.3+ MB


In [24]:
df.columns

Index(['user', 'activity', 'timestamp', 'x', 'y', 'z'], dtype='object')

In [26]:
le=LabelEncoder()

In [28]:
df['activity']=le.fit_transform(df['activity'])

In [30]:
df

Unnamed: 0,user,activity,timestamp,x,y,z
0,33,1,49105962326000,-0.6946377,12.680544,0.50395286
1,33,1,49106062271000,5.012288,11.264028,0.95342433
2,33,1,49106112167000,4.903325,10.882658,-0.08172209
3,33,1,49106222305000,-0.61291564,18.496431,3.0237172
4,33,1,49106332290000,-1.1849703,12.108489,7.205164
...,...,...,...,...,...,...
1098199,19,2,131623331483000,9,-1.57,1.69
1098200,19,2,131623371431000,9.04,-1.46,1.73
1098201,19,2,131623411592000,9.08,-1.38,1.69
1098202,19,2,131623491487000,9,-1.46,1.73


In [32]:
scaler=StandardScaler()

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098204 entries, 0 to 1098203
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user       1098204 non-null  object
 1   activity   1098204 non-null  int32 
 2   timestamp  1098204 non-null  object
 3   x          1098204 non-null  object
 4   y          1098204 non-null  object
 5   z          1098204 non-null  object
dtypes: int32(1), object(5)
memory usage: 46.1+ MB


In [54]:
print(df.dtypes)

user          object
activity       int32
timestamp     object
x            float64
y            float64
z             object
dtype: object


In [56]:
print(df[['x', 'y', 'z']].head(10))

          x          y            z
0 -0.694638  12.680544   0.50395286
1  5.012288  11.264028   0.95342433
2  4.903325  10.882658  -0.08172209
3 -0.612916  18.496431    3.0237172
4 -1.184970  12.108489     7.205164
5  1.375655  -2.492524    -6.510526
6 -0.612916  10.569390     5.706926
7 -0.503953  13.947236    7.0553403
8 -8.430995  11.413852     5.134871
9  0.953424   1.375655    1.6480621


In [58]:
df['z'] = df['z'].astype(str).str.replace(';', '', regex=False)

In [60]:
df['x'] = pd.to_numeric(df['x'], errors='coerce')
df['y'] = pd.to_numeric(df['y'], errors='coerce')
df['z'] = pd.to_numeric(df['z'], errors='coerce')

In [62]:
print(df.dtypes)

user          object
activity       int32
timestamp     object
x            float64
y            float64
z            float64
dtype: object


In [66]:
X=scaler.fit_transform(df[['x', 'y', 'z']])

In [68]:
X

array([[-0.19820336,  0.80414215,  0.01953915],
       [ 0.63503947,  0.59416976,  0.11408298],
       [ 0.61913026,  0.53763869, -0.10365431],
       ...,
       [ 1.228948  , -1.28007437,  0.26901757],
       [ 1.21726755, -1.2919329 ,  0.27743135],
       [ 1.19974689, -1.27266279,  0.25219001]])

In [73]:
y = df['activity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [75]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [79]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.33      0.13      0.18     20197
           1       0.71      0.71      0.71     68419
           2       0.98      0.98      0.98     12024
           3       0.83      0.87      0.85      9781
           4       0.34      0.14      0.20     24764
           5       0.60      0.78      0.67     84456

    accuracy                           0.64    219641
   macro avg       0.63      0.60      0.60    219641
weighted avg       0.61      0.64      0.61    219641



In [83]:
joblib.dump(model, 'fitness_model.pkl')

['fitness_model.pkl']