In [6]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [7]:
# Path to the data folders
data_path = r'E:\Research paper\motion watch 8\Raw Data'
categories = ['Crime', 'Historical', 'Horror', 'Romance']

In [8]:
# Function to load data
def load_data(data_path, categories):
    data = []
    labels = []
    for category in categories:
        category_path = os.path.join(data_path, category)
        for file in os.listdir(category_path):
            if file.endswith('.xlsx') or file.endswith('.xls'):
                file_path = os.path.join(category_path, file)
                df = pd.read_excel(file_path)
                data.append(df)
                labels.append(category)
    return data, labels

In [9]:
# Load the data
data, labels = load_data(data_path, categories)

In [10]:
# Check if data is loaded correctly
if not data:
    raise ValueError("No data found. Please check the data path and folder structure.")
else:
    print(f"Loaded data for {len(data)} files.")

Loaded data for 80 files.


In [11]:
# Print column names for debugging
for i, df in enumerate(data):
    print(f"File {i} columns: {df.columns.tolist()}")

File 0 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 1 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 2 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 3 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 4 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 5 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 6 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 7 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)', 'Light (lux)']
File 8 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 9 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 10 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 11 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 12 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 13 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)']
File 14 columns: ['Date', 'Time', 'Activity (Triaxial-Counts)', 'Light (lux)']
File 15 columns: ['D

In [12]:
# Preprocess and extract features
def preprocess_data(data):
    # Extract features from the raw data
    feature_data = []
    for df in data:
        if df.empty:
            continue
        
        # Check and rename columns if necessary
        if 'Activity (Triaxial-Counts)' not in df.columns:
            raise KeyError(f"Expected column 'Activity (Triaxial-Counts)' not found in dataframe with columns {df.columns.tolist()}")
        
        # Example feature extraction: mean and std of triaxial counts
        mean_activity = df['Activity (Triaxial-Counts)'].mean()
        std_activity = df['Activity (Triaxial-Counts)'].std()
        feature_data.append([mean_activity, std_activity])
    return pd.DataFrame(feature_data, columns=['MeanActivity', 'StdActivity'])


In [13]:
# Preprocess data
X = preprocess_data(data)

# Check if features are extracted correctly
if X.empty:
    raise ValueError("Feature extraction failed. No features extracted.")
else:
    print(f"Extracted features for {X.shape[0]} files.")

y = labels[:len(X)]  # Match labels to the number of successfully processed files


Extracted features for 80 files.


In [14]:
# Encode labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)


In [16]:
# Train the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


In [17]:
# Predict on the test set
y_pred = clf.predict(X_test)


In [18]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)


In [19]:
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


Accuracy: 0.375
Classification Report:
              precision    recall  f1-score   support

       Crime       0.38      0.43      0.40         7
  Historical       0.50      0.50      0.50         6
      Horror       0.40      0.40      0.40         5
     Romance       0.20      0.17      0.18         6

    accuracy                           0.38        24
   macro avg       0.37      0.37      0.37        24
weighted avg       0.37      0.38      0.37        24



In [20]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_train,y_train)

In [21]:
# Predict on the test set
y_pred = clf.predict(X_test)


In [22]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


Accuracy: 0.20833333333333334
Classification Report:
              precision    recall  f1-score   support

       Crime       0.00      0.00      0.00         7
  Historical       0.00      0.00      0.00         6
      Horror       0.21      1.00      0.34         5
     Romance       0.00      0.00      0.00         6

    accuracy                           0.21        24
   macro avg       0.05      0.25      0.09        24
weighted avg       0.04      0.21      0.07        24



In [24]:
from sklearn.tree import DecisionTreeClassifier
clf =DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.3333333333333333
Classification Report:
              precision    recall  f1-score   support

       Crime       0.43      0.43      0.43         7
  Historical       0.43      0.50      0.46         6
      Horror       0.20      0.20      0.20         5
     Romance       0.20      0.17      0.18         6

    accuracy                           0.33        24
   macro avg       0.31      0.32      0.32        24
weighted avg       0.32      0.33      0.33        24



In [25]:
from sklearn.linear_model import LogisticRegression
clf =LogisticRegression(max_iter=100)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.25
Classification Report:
              precision    recall  f1-score   support

       Crime       0.33      0.14      0.20         7
  Historical       0.30      0.50      0.37         6
      Horror       0.17      0.20      0.18         5
     Romance       0.20      0.17      0.18         6

    accuracy                           0.25        24
   macro avg       0.25      0.25      0.23        24
weighted avg       0.26      0.25      0.24        24



In [26]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.2916666666666667
Classification Report:
              precision    recall  f1-score   support

       Crime       0.38      0.43      0.40         7
  Historical       0.33      0.50      0.40         6
      Horror       0.25      0.20      0.22         5
     Romance       0.00      0.00      0.00         6

    accuracy                           0.29        24
   macro avg       0.24      0.28      0.26        24
weighted avg       0.24      0.29      0.26        24



In [27]:
from sklearn.naive_bayes import MultinomialNB
clf= MultinomialNB()
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.3333333333333333
Classification Report:
              precision    recall  f1-score   support

       Crime       0.38      0.43      0.40         7
  Historical       0.38      0.83      0.53         6
      Horror       0.00      0.00      0.00         5
     Romance       0.00      0.00      0.00         6

    accuracy                           0.33        24
   macro avg       0.19      0.32      0.23        24
weighted avg       0.21      0.33      0.25        24



In [28]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.375
Classification Report:
              precision    recall  f1-score   support

       Crime       0.60      0.43      0.50         7
  Historical       0.30      0.50      0.37         6
      Horror       0.25      0.40      0.31         5
     Romance       1.00      0.17      0.29         6

    accuracy                           0.38        24
   macro avg       0.54      0.37      0.37        24
weighted avg       0.55      0.38      0.38        24

