<h3>Decision Tree - Respondents 1 &amp; 3</h3>

<h2>Pre-processing</h2>

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

# Load the two per-respondent recordings and stack them into one frame.
# ignore_index=True avoids duplicate row labels in the merged frame.
file_path = '001-b45731a0_emotions_with_cognitive_tesk_and_screen.csv'
data = pd.read_csv(file_path)

new_file_path = '003-91dc3428_emotions_with_cognitive_tesk_and_screen.csv'
new_data = pd.read_csv(new_file_path)

data = pd.concat([data, new_data], ignore_index=True)

# The timestamp is not used as a feature, and rows without a label cannot be
# used for supervised training.
target_column = 'Cognitive Task'
data = data.drop(columns='Timestamp').dropna(subset=[target_column])
print(data.head())
print(data.shape[0])

# Select only the relevant columns (predictors plus the target), and give the
# remaining rows a clean 0..n-1 index so X, y and the derived frames stay aligned.
selected_features = ['Anger', 'Contempt', 'Disgust', 'Fear', 'Surprise', 'Confusion', 'Cognitive Task', 'Screens']
data = data[selected_features].reset_index(drop=True)

# Categories of the 'Screens' column that become one-hot features. Any other
# value (including the 'missing' placeholder from the imputer) is encoded as
# all zeros because the encoder uses handle_unknown='ignore'.
selected_screens = ['Filter', 'Supporting material', 'Map', 'Statistics']

# Identify numeric and categorical predictors; the target is excluded up front.
all_columns = [column for column in data.columns if column != target_column]
numeric_features = data[all_columns].select_dtypes(include=['int64', 'float64']).columns
categorical_features = ['Screens']

# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical column: fill gaps with a placeholder, then one-hot encode against
# the fixed category list so the output columns are known in advance.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(categories=[selected_screens], handle_unknown='ignore', sparse_output=False))])

# Combine the preprocessing steps into a single preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Separate predictors and target.
y = data[target_column]
X = data.drop(columns=[target_column])

# ColumnTransformer emits the 'num' block first and the 'cat' block second,
# and the one-hot columns follow the order of selected_screens.
all_feature_names = list(numeric_features) + selected_screens

# Split BEFORE fitting the preprocessor so that the imputation medians and
# scaling statistics are learned from the training rows only (no test leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

preprocessor.fit(X_train_raw)


def _to_feature_frame(raw_frame):
    """Apply the fitted preprocessor and return a labelled DataFrame that keeps the input's index."""
    return pd.DataFrame(
        preprocessor.transform(raw_frame),
        columns=all_feature_names,
        index=raw_frame.index)


X_train = _to_feature_frame(X_train_raw)
X_test = _to_feature_frame(X_test_raw)

# Full feature matrix (transformed with the training-set statistics), kept for
# later cells that evaluate on all rows.
X_preprocessed = _to_feature_frame(X)


         Anger  Contempt   Disgust      Fear       Joy   Sadness  Surprise  \
3705  0.125566  0.136948  0.021518  0.116770  0.041248  0.112264  0.059114   
3706  0.125566  0.137617  0.021531  0.132963  0.040812  0.112249  0.067522   
3707  0.125566  0.137096  0.021582  0.169795  0.040925  0.111568  0.087434   
3708  0.125566  0.136868  0.021714  0.218535  0.041063  0.111219  0.114075   
3709  0.125566  0.137239  0.021806  0.256706  0.040785  0.110944  0.135015   

      Engagement  Valence  Sentimentality  Confusion Cognitive Task Screens  
3705    0.328766      0.0        0.976782   0.036977        Explore     Map  
3706    0.328766      0.0        0.911952   0.034095        Explore     Map  
3707    0.328766      0.0        0.831274   0.026797        Explore     Map  
3708    0.328766      0.0        0.829263   0.019318        Explore     Map  
3709    0.328766      0.0        0.773583   0.014509        Explore     Map  
148759


<h2>Decision Tree Model</h2>

In [10]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Baseline decision tree with cost-complexity pruning.
# random_state is fixed so that repeated runs build the same tree and report
# the same metrics.
# NOTE(review): ccp_alpha=0.1 is a strong pruning penalty; confirm that the
# stored output for this cell was actually produced with this value.
dt = DecisionTreeClassifier(ccp_alpha=0.1, random_state=42)

# Fit on the training split produced by the pre-processing cell
dt.fit(X_train, y_train)

# Predict the held-out test split
y_pred = dt.predict(X_test)

# Report the confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))


Confusion Matrix:
 [[6001  315  318   48  179  510  117]
 [ 281 2113  190   48   81  239  105]
 [ 290  200 4408   52   75  161  124]
 [  58   42   58 1505   16    4  368]
 [ 196   70   90    8 1628   84   41]
 [ 534  210  192    9  103 3533    1]
 [ 112  105  118  398   43    3 4368]]

Classification Report:
                      precision    recall  f1-score   support

     Assess Results       0.80      0.80      0.80      7488
           Conclude       0.69      0.69      0.69      3057
            Explore       0.82      0.83      0.83      5310
              Focus       0.73      0.73      0.73      2051
Generate Hypothesis       0.77      0.77      0.77      2117
           Set Goal       0.78      0.77      0.78      4582
    Test Hypothesis       0.85      0.85      0.85      5147

           accuracy                           0.79     29752
          macro avg       0.78      0.78      0.78     29752
       weighted avg       0.79      0.79      0.79     29752


Accuracy Score

<h3>With Hyperparameter Tuning</h3>

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space: 8 * 4 * 4 * 4 = 512 candidate settings.
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.3]
}

# Base estimator for the search; random_state is fixed so every candidate is
# fitted deterministically and the selected parameters are reproducible.
dt = DecisionTreeClassifier(random_state=42)

# Exhaustive 5-fold search scored by accuracy. With 512 candidates and 5 folds
# this is 2560 fits, so n_jobs=-1 spreads them over all available CPU cores.
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Run the search on the training split. With the default refit=True the best
# setting is then refitted on the whole training split.
grid_search.fit(X_train, y_train)

# Check the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Predict the held-out test split with the refitted best estimator
y_pred = grid_search.predict(X_test)

# Report the confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))


<h3>Cross-Validation</h3>

In [12]:
from sklearn.model_selection import cross_val_score

# Score `dt` with 5-fold cross-validation over the full preprocessed matrix.
# cross_val_score fits a fresh clone of the estimator on each fold.
# NOTE(review): `dt` is whichever estimator was bound most recently; when the
# notebook is run top to bottom that is the base estimator from the grid-search
# cell, not the tuned model - confirm this is the model meant to be evaluated.
# NOTE(review): an integer cv does not shuffle rows, so folds are contiguous
# blocks of the concatenated recordings - presumably why this score differs
# from the random hold-out accuracy; verify.
cv_scores = cross_val_score(dt, X_preprocessed, y, cv=5)
mean_cv_score = cv_scores.mean()
print("Average cross-validation score:", mean_cv_score)


Average cross-validation score: 0.36561799360252517
