<a href="https://colab.research.google.com/github/jamalimubashirali/Machine-Learning-Projects/blob/main/personality_prediction_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [64]:
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/Kaggle/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [65]:
!kaggle competitions download -c playground-series-s5e7
!unzip playground-series-s5e7.zip

playground-series-s5e7.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  playground-series-s5e7.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [66]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score

# Classifiers
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB, CategoricalNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier,
                              AdaBoostClassifier, BaggingClassifier, HistGradientBoostingClassifier,
                              StackingClassifier, VotingClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [67]:
df = pd.read_csv('train.csv' , index_col='id')
test_df = pd.read_csv('test.csv' , index_col='id')

In [68]:
df.head()

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18524 entries, 0 to 18523
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           17334 non-null  float64
 1   Stage_fear                 16631 non-null  object 
 2   Social_event_attendance    17344 non-null  float64
 3   Going_outside              17058 non-null  float64
 4   Drained_after_socializing  17375 non-null  object 
 5   Friends_circle_size        17470 non-null  float64
 6   Post_frequency             17260 non-null  float64
 7   Personality                18524 non-null  object 
dtypes: float64(5), object(3)
memory usage: 1.3+ MB


In [70]:
#
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Time_spent_Alone,17334.0,3.137764,3.003786,0.0,1.0,2.0,4.0,11.0
Social_event_attendance,17344.0,5.265106,2.753359,0.0,3.0,5.0,8.0,10.0
Going_outside,17058.0,4.044319,2.06258,0.0,3.0,4.0,6.0,7.0
Friends_circle_size,17470.0,7.996737,4.223484,0.0,5.0,8.0,12.0,15.0
Post_frequency,17260.0,4.982097,2.879139,0.0,3.0,5.0,7.0,10.0


In [71]:
# Splitting Data
df['Personality'] = df['Personality'].map({'Extrovert': 0, 'Introvert': 1})
X = df.drop(columns=['Personality'])
y = df['Personality']

In [72]:
# Defining Columns to be transformed
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
categorical_columns = ['Stage_fear' , 'Drained_after_socializing']
numeric_columns = X.drop(columns=categorical_columns).columns

In [73]:
# Numeric Data Cleaning and Scaling
numeric_transformer = Pipeline(steps=[
    ('imputer' , SimpleImputer(strategy='mean')),
    ('scaler' , StandardScaler())
])

In [74]:
categorical_transformer = Pipeline(
    steps=[
    ('imputer' , SimpleImputer(strategy='most_frequent')),
    ('encoder' , OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [75]:
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric' , numeric_transformer , numeric_columns),
        ('categorical' , categorical_transformer , categorical_columns)
    ]
)

In [83]:
# List of classifiers to evaluate
classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=500),
    "RidgeClassifier": RidgeClassifier(),
    "SGDClassifier": SGDClassifier(),
    "Perceptron": Perceptron(),
    "PassiveAggressiveClassifier": PassiveAggressiveClassifier(),
    "GaussianNB": GaussianNB(),
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "CategoricalNB": CategoricalNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "NearestCentroid": NearestCentroid(),
    "SVC": SVC(),
    "LinearSVC": LinearSVC(),
    "NuSVC": NuSVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "ExtraTreesClassifier": ExtraTreesClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "BaggingClassifier": BaggingClassifier(),
    "HistGradientBoostingClassifier": HistGradientBoostingClassifier(),
    "MLPClassifier": MLPClassifier(max_iter=300),
    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
    "DummyClassifier": DummyClassifier(),
    "VotingClassifier": VotingClassifier(estimators=[('gb', GradientBoostingClassifier()), ('ad', AdaBoostClassifier())]),
    "StackingClassifier": StackingClassifier(estimators=[('lr', GradientBoostingClassifier()), ('ad', AdaBoostClassifier())]),
    "XGBoostClassifer" : XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Train and evaluate each classifier
results = {}


In [84]:
# Making Predictions using different models
for name, clf in classifiers.items():
    try:
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', clf)
        ])
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_val)
        acc = accuracy_score(y_val, preds)
        results[name] = acc
    except Exception as e:
        results[name] = f"Error: {str(e)}"

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [85]:
print("\\nModel Accuracy Scores:")
for name, score in results.items():
    print(f"{name}: {score}")

\nModel Accuracy Scores:
LogisticRegression: 0.9697787371829466
RidgeClassifier: 0.9686994063680519
SGDClassifier: 0.9692390717754992
Perceptron: 0.9686994063680519
PassiveAggressiveClassifier: 0.9692390717754992
GaussianNB: 0.9697787371829466
MultinomialNB: Error: Negative values in data passed to MultinomialNB (input X).
BernoulliNB: 0.9686994063680519
ComplementNB: Error: Negative values in data passed to ComplementNB (input X).
CategoricalNB: Error: Negative values in data passed to CategoricalNB (input X).
KNeighborsClassifier: 0.9697787371829466
NearestCentroid: 0.9681597409606044
SVC: 0.9697787371829466
LinearSVC: 0.9692390717754992
NuSVC: 0.9697787371829466
DecisionTreeClassifier: 0.9400971397733405
RandomForestClassifier: 0.9692390717754992
ExtraTreesClassifier: 0.9654614139233675
GradientBoostingClassifier: 0.9703184025903939
AdaBoostClassifier: 0.9692390717754992
BaggingClassifier: 0.9611440906637885
HistGradientBoostingClassifier: 0.9703184025903939
MLPClassifier: 0.9697787

In [79]:
model_1 = Pipeline(
    steps=[
        ('preprocessor' , preprocessor),
        ('model' , GradientBoostingClassifier())
    ]
)
model_2 = Pipeline(
    steps=[
        ('preprocessor' , preprocessor),
        ('model' , AdaBoostClassifier())
    ]
)

model_3 = Pipeline(
    steps=[
        ('preprocessor' , preprocessor),
        ('model' , HistGradientBoostingClassifier())
    ]
)

In [80]:
# Training model_1 Whole data set
model_1.fit(X , y)
predictions = model_1.predict(test_df)
gradient_boost_model_predictions = pd.DataFrame({
    'id' : list(test_df.index),
    'Personality' : predictions
})
gradient_boost_model_predictions.to_csv('submission_GB.csv' , index=False)

In [81]:
# Training model_2 Whole data set
model_2.fit(X , y)
predictions = model_2.predict(test_df)
ada_boost_model_predictions = pd.DataFrame({
    'id' : list(test_df.index),
    'Personality' : predictions
})
ada_boost_model_predictions.to_csv('submission_AB.csv' , index=False)

In [82]:
# Training model_2 Whole data set
model_3.fit(X , y)
predictions = model_3.predict(test_df)
high_gradient_boost_model_predictions = pd.DataFrame({
    'id' : list(test_df.index),
    'Personality' : predictions
})
high_gradient_boost_model_predictions.to_csv('submission_HGB.csv' , index=False)