In [2]:
import matplotlib.pyplot as plt # For plotting
import numpy as np              # Linear algebra library
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# The upload widget only exists on Google Colab. Guard the import so the
# notebook still runs from a fresh kernel elsewhere, as long as
# clean_dataset.csv is already present in the working directory.
try:
    from google.colab import files
except ImportError:
    files = None
    uploaded = None
else:
    # Prompts for a file in the browser; the following read_csv call expects
    # the uploaded file to be named clean_dataset.csv.
    uploaded = files.upload()

data = pd.read_csv("clean_dataset.csv")

Saving clean_dataset.csv to clean_dataset.csv


In [34]:
"""
Preprocessing steps
"""

import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Reload the dataset so this cell does not depend on the upload cell's frame.
data = pd.read_csv("clean_dataset.csv")
expected_columns = ['id', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Label']

# Keep only the rows that have a value in every expected column.
data = data.dropna(subset=expected_columns)

# 'id' is dropped before any feature engineering.
data = data.drop(columns=['id'])


def _selected(option):
    """Build a checker returning 1 when `option` is one of the comma-separated
    entries of a Q5 answer, and 0 otherwise (including non-string answers)."""
    def check(answer):
        if not isinstance(answer, str):
            return 0
        return 1 if option in answer.split(',') else 0
    return check


# Q5 holds comma-separated selections: add one 0/1 indicator column per option.
categories = ["Siblings", "Co-worker", "Partner", "Friends"]
for category in categories:
    data[category] = data['Q5'].map(_selected(category))

# The indicator columns replace the raw 'Q5' column.
data = data.drop(columns=['Q5'])

def extract_rankings(text):
    """Parse a ``"name=>rank,name=>rank"`` string into a ``{name: rank}`` dict.

    Parameters
    ----------
    text : str
        Comma-separated ``name=>rank`` items. An item whose rank part is empty
        or whitespace-only (``"name=>"``) is recorded with rank 0.

    Returns
    -------
    dict
        Maps each name to its integer rank. An empty dict is returned when
        ``text`` is not a string or contains nothing but whitespace.

    Raises
    ------
    ValueError
        If a non-blank item does not contain exactly one ``=>`` separator, or
        if its rank is not an integer literal.
    """
    # Missing answers (e.g. NaN floats) and empty strings carry no rankings.
    if not isinstance(text, str) or not text.strip():
        return {}

    rankings = {}
    for item in text.split(','):
        # Blank items come from trailing or doubled commas; skip them rather
        # than failing on the unpacking below.
        if not item.strip():
            continue
        name, rank = item.split('=>')
        rank = rank.strip()
        rankings[name] = int(rank) if rank else 0
    return rankings

# Parse each 'Q6' answer into a dict, then let pandas spread the dict keys
# into one column per key.
parsed_q6 = data['Q6'].apply(extract_rankings)
rankings_df = parsed_q6.apply(pd.Series)

# Swap the raw 'Q6' column for the expanded ranking columns.
data_without_q6 = data.drop(columns='Q6')
data = pd.concat([data_without_q6, rankings_df], axis=1)

# Q7-Q9 are converted only when pandas loaded them with object dtype:
# commas are removed from the strings before casting to float.
numeric_columns = ['Q7', 'Q8', 'Q9']
for col in numeric_columns:
    column = data[col]
    if column.dtype != object:
        continue  # not an object column, leave untouched
    data[col] = column.str.replace(',', '').astype(float)

def clean_text(text):
    """Reduce free text to ASCII letters, digits and spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (None, NaN, numbers) is treated as
        missing.

    Returns
    -------
    str
        ``text`` with each run of non-ASCII characters replaced by one space,
        each whitespace character (tab, newline, ...) replaced by a space, and
        every remaining character outside ``A-Za-z0-9`` and space removed.
        Returns ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Turn tabs/newlines into spaces first; otherwise the next step deletes
    # them and fuses the neighbouring words ("a\nb" -> "ab").
    text = re.sub(r'\s', ' ', text)
    # Keep alphanumeric characters only (including spaces)
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    return text

# Run every 'Q10' entry through clean_text, then preview the processed frame.
data['Q10'] = data['Q10'].map(clean_text)

print(data.head())

    Q1   Q2   Q3   Q4    Q7    Q8     Q9  \
0  4.0  1.0  2.0  1.0  25.0   7.0  100.0   
1  4.0  3.0  5.0  2.0  20.0   3.0    4.0   
2  5.0  4.0  5.0  1.0  32.0   5.0    4.0   
3  5.0  4.0  4.0  1.0  23.0  10.0    3.0   
4  4.0  3.0  3.0  3.0  20.0  10.0    5.0   

                                                 Q10  Label  Siblings  \
0                                            Slavery  Dubai         0   
1  Wherever there is great property there is grea...  Dubai         0   
2                                    Futuristic land  Dubai         0   
3                The city where anything is possible  Dubai         0   
4  If you can think of a high building it probabl...  Dubai         1   

   Co-worker  Partner  Friends  Skyscrapers  Sport  Art and Music  Carnival  \
0          1        0        0            6      4              2         1   
1          1        0        0            6      1              2         3   
2          0        1        1            6      2         

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
# Separate the target column from the feature columns.
y = data['Label']
X = data.drop(columns=['Label'])

# Hold out 20% of the rows as a test set. The split is done before TF-IDF so
# the vectorizer below is fitted on training text only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Unigram TF-IDF over 'Q10': English stop words removed, min_df=5, and the
# vocabulary capped at 100 features.
tfidf_vectorizer = TfidfVectorizer(
    max_features=100,
    ngram_range=(1, 1),
    stop_words='english',
    min_df=5,
)

# Fit on the training text, then apply the same fitted vectorizer to the test text.
Q10_train_tfidf = tfidf_vectorizer.fit_transform(X_train['Q10']).toarray()
Q10_test_tfidf = tfidf_vectorizer.transform(X_test['Q10']).toarray()

# Wrap the dense matrices in DataFrames that reuse each split's row index so
# they line up with the remaining features when concatenated.
tfidf_columns = tfidf_vectorizer.get_feature_names_out()
Q10_train_df = pd.DataFrame(Q10_train_tfidf, index=X_train.index, columns=tfidf_columns)
Q10_test_df = pd.DataFrame(Q10_test_tfidf, index=X_test.index, columns=tfidf_columns)

# The raw text column is replaced by its TF-IDF representation.
X_train = X_train.drop(columns='Q10')
X_test = X_test.drop(columns='Q10')

# Final feature frames: remaining columns followed by the TF-IDF columns.
X_train_final = pd.concat([X_train, Q10_train_df], axis=1)
X_test_final = pd.concat([X_test, Q10_test_df], axis=1)

# Encode the labels as dense one-hot rows.
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output` and 1.4 removed the old name, so try the new keyword first
# and fall back to the old one on releases that do not know it yet.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
# The encoder expects a 2-D input, hence the reshape to a single column.
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

# Standardize the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)

# Number of feature columns after scaling.
input_dim = X_train_scaled.shape[1]



In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

# Expand the scaled features with degree-2 polynomial terms; the expansion is
# fitted on the training split and reused for the test split.
poly = PolynomialFeatures(degree=2)  # Experiment with the degree
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Candidate values of C; only referenced by the commented-out GridSearchCV
# code further down in this cell.
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# Logistic regression on the polynomial features with a fixed C of 0.01.
lr_model = LogisticRegression(C=0.01, max_iter=1000)
lr_model.fit(X_train_poly, y_train)

# Accuracy on the held-out test split.
y_test_pred = lr_model.predict(X_test_poly)
test_acc = accuracy_score(y_test, y_test_pred)
print("Test Accuracy with Polynomial Features:", test_acc)



# Initialize the GridSearchCV object
#grid_search = GridSearchCV(LogisticRegression(max_iter=10000), param_grid, cv=5)

# Fit it to the scaled training data
#grid_search.fit(X_train_poly, y_train)

# Print the best parameters and the corresponding accuracy
#print("Best Parameters:", grid_search.best_params_)
#print("Best CV Score:", grid_search.best_score_)

# Evaluate on the test set
#y_test_pred = grid_search.predict(X_test_poly)
#test_acc = accuracy_score(y_test, y_test_pred)
#print("GridSearchCV Test Acc:", test_acc)

Test Accuracy with Polynomial Features: 0.8846153846153846
