# Let's create classification model for Our Cleaned WISDM Dataset
So for classification we have Logistic regression and RandomforestClassifier
We want are model to predict the users activity babaes on the 

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score                 

In [9]:
df = pd.read_csv('cleaned_wisdm_data.csv')
df.head()

Unnamed: 0,user,activity,timestamp,x,y,z
0,33,Jogging,49105962326000,-0.694638,12.680544,0.503953
1,33,Jogging,49106062271000,5.012288,11.264028,0.953424
2,33,Jogging,49106112167000,4.903325,10.882658,-0.081722
3,33,Jogging,49106222305000,-0.612916,18.496431,3.023717
4,33,Jogging,49106332290000,-1.18497,12.108489,7.205164


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1086465 entries, 0 to 1086464
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   user       1086465 non-null  int64  
 1   activity   1086465 non-null  object 
 2   timestamp  1086465 non-null  int64  
 3   x          1086465 non-null  float64
 4   y          1086465 non-null  float64
 5   z          1086465 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 49.7+ MB


In [11]:
df.isnull().sum()

user         0
activity     0
timestamp    0
x            0
y            0
z            0
dtype: int64

> For the model, `user` and `timestamp` are not needed so only the `x`, `y` and `z` will be the only features used for the model making the activity or target.


In [12]:
df = df.drop(['timestamp','user'], axis=1)
df.head()

Unnamed: 0,activity,x,y,z
0,Jogging,-0.694638,12.680544,0.503953
1,Jogging,5.012288,11.264028,0.953424
2,Jogging,4.903325,10.882658,-0.081722
3,Jogging,-0.612916,18.496431,3.023717
4,Jogging,-1.18497,12.108489,7.205164


In [13]:
X = df.drop('activity', axis=1)
y = df['activity']

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
first_model = LogisticRegression(solver='saga',multi_class="ovr",max_iter=50)
first_model.fit(X_train, y_train)
y_pred = first_model.predict(X_test)   

In [17]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of Logistic Regression: {accuracy*100:.2f}%')

Accuracy of Logistic Regression: 47.11%


In [18]:
model_2 = RandomForestClassifier(n_estimators=200,
    random_state=42,
    n_jobs=-1)
model_2.fit(X_train, y_train)
model_2.predict(X_test)

array(['Jogging', 'Jogging', 'Jogging', ..., 'Jogging', 'Walking',
       'Walking'], dtype=object)

In [19]:
model_2.score(X_test,y_test)

0.6422204120703382

In [20]:
# ==========================================
# HUMAN ACTIVITY RECOGNITION MODEL (WISDM)
# ==========================================

import pandas as pd
import numpy as np
from scipy.stats import entropy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



WINDOW_SIZE = 100  # smaller window to ensure enough samples

segments = []
labels = []

for i in range(0, len(df) - WINDOW_SIZE, WINDOW_SIZE):
    window = df.iloc[i : i + WINDOW_SIZE]
    x = window['x'].values
    y = window['y'].values
    z = window['z'].values

    # Use majority (most frequent) activity label in the window
    activity = window['activity'].mode()[0]

    # Safe correlation function
    def safe_corr(a, b):
        if np.std(a) == 0 or np.std(b) == 0:
            return 0
        return np.corrcoef(a, b)[0, 1]

    # Extract time-domain features
    features = [
        np.mean(x), np.mean(y), np.mean(z),
        np.std(x), np.std(y), np.std(z),
        np.min(x), np.min(y), np.min(z),
        np.max(x), np.max(y), np.max(z),
        np.sqrt(np.var(x) + np.var(y) + np.var(z)),        # signal magnitude std
        np.mean(np.abs(x) + np.abs(y) + np.abs(z)),        # SMA
        entropy(np.abs(x) + 1e-10),                        # avoid log(0)
        entropy(np.abs(y) + 1e-10),
        entropy(np.abs(z) + 1e-10),
        safe_corr(x, y), safe_corr(x, z), safe_corr(y, z)
    ]

    segments.append(features)
    labels.append(activity)

print(f"✅ Created {len(segments)} segments from {len(df)} samples.")

# Convert to DataFrame
columns = [
    'mean_x', 'mean_y', 'mean_z',
    'std_x', 'std_y', 'std_z',
    'min_x', 'min_y', 'min_z',
    'max_x', 'max_y', 'max_z',
    'magnitude_std', 'SMA',
    'entropy_x', 'entropy_y', 'entropy_z',
    'corr_xy', 'corr_xz', 'corr_yz'
]

X = pd.DataFrame(segments, columns=columns)
y = pd.Series(labels, name='activity')


# 3. Split and train the model

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

# -----------------------------
# 4. Evaluate the model
# -----------------------------
y_pred = model.predict(X_test)

print("\n✅ Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🔹 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Created 10864 segments from 1086465 samples.

✅ Accuracy: 0.9549

📊 Classification Report:
               precision    recall  f1-score   support

  Downstairs       0.91      0.77      0.84       201
     Jogging       0.99      0.99      0.99       674
     Sitting       1.00      0.97      0.98       119
    Standing       0.99      0.99      0.99        97
    Upstairs       0.85      0.85      0.85       245
     Walking       0.96      0.99      0.98       837

    accuracy                           0.95      2173
   macro avg       0.95      0.93      0.94      2173
weighted avg       0.95      0.95      0.95      2173


🔹 Confusion Matrix:
 [[155   2   0   0  26  18]
 [  0 669   0   0   2   3]
 [  0   0 115   1   3   0]
 [  0   0   0  96   1   0]
 [ 14   7   0   0 209  15]
 [  1   0   0   0   5 831]]


In [23]:
import pickle
from sklearn.pipeline import make_pipeline

# Create a pipeline with scaler and RandomForestClassifier
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
)
pipeline.fit(X_train, y_train)

# Save the pipeline
with open('activity_classifier.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

print("Model saved using pickle!")



Model saved using pickle!
