# Hanna Wierszok

In [31]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
# Load the training split of the survey data (Colab sample_data path) and
# leave the frame as the cell's last expression so it is rendered.
train_data = pd.read_csv(
    filepath_or_buffer='/content/sample_data/CAH-201803-train.csv',
)
train_data

Unnamed: 0,id_num,Q1,Q2,political_affiliation,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,1,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,5,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,7,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,8,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,9,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,325,Male,21,Republican,Conservative,Some college,White,Yes,No,Yes,No,Pro-Choice,Yes,No,Less Willing,5,2,5,No
165,328,Female,41,Republican,Liberal,Graduate degree,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-Choice,Yes,No,Less Willing,5,2,2,No
166,329,Male,60,Republican,Conservative,Some college,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,No,Yes,Behave no differently,5,5,4,Yes
167,332,Female,51,Republican,Conservative,Graduate degree,White,Yes,Yes,Yes,"Yes, very religious",Pro-life,Yes,No,Less Willing,2,5,1,No


In [32]:
# Remove the id_num column, then discard every row that still contains a
# missing value. The raw `train_data` frame is left untouched.
train_data_cleaned = (
    train_data
    .drop(columns=['id_num'])
    .dropna()
)
train_data_cleaned

Unnamed: 0,Q1,Q2,political_affiliation,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,Male,21,Republican,Conservative,Some college,White,Yes,No,Yes,No,Pro-Choice,Yes,No,Less Willing,5,2,5,No
165,Female,41,Republican,Liberal,Graduate degree,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-Choice,Yes,No,Less Willing,5,2,2,No
166,Male,60,Republican,Conservative,Some college,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,No,Yes,Behave no differently,5,5,4,Yes
167,Female,51,Republican,Conservative,Graduate degree,White,Yes,Yes,Yes,"Yes, very religious",Pro-life,Yes,No,Less Willing,2,5,1,No


In [33]:
# Separate the label column (y) from the predictor columns (X)
y = train_data_cleaned['political_affiliation']
X = train_data_cleaned.drop(columns=['political_affiliation'])

# Partition the predictors by dtype: int64/float64 columns are standardised,
# object/category columns are one-hot encoded.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# handle_unknown='ignore' lets the encoder accept category values at predict
# time that were not present when it was fitted.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


Logistic regression

In [34]:
# Creating a Logistic Regression model wrapped in a pipeline so that the
# preprocessing is fitted together with the classifier.
logreg_model = LogisticRegression(random_state=0, max_iter=1000)
logreg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('classifier', logreg_model)])

# Hold out 20% of the labelled rows for a final accuracy check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Tune the regularisation strength C with 5-fold cross-validation on the
# training split only; the hold-out split is not seen during the search.
param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(logreg_pipeline, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_logreg_model = grid_search.best_estimator_

# Accuracy of the selected pipeline on the hold-out split.
y_pred_logreg = best_logreg_model.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f"Accuracy with Logistic Regression: {accuracy_logreg}")

# Predict on the unlabelled test file, using the same cleaning as for training.
test_data = pd.read_csv("/content/sample_data/CAH-201803-test.csv")
test_data_cleaned = test_data.drop(columns=['id_num']).dropna()
y_pred_logreg_test = best_logreg_model.predict(test_data_cleaned)

# dropna() may have removed rows, so take the ids from exactly the rows that
# were predicted on. Using the unfiltered test_data['id_num'] here would raise
# a length-mismatch error as soon as any test row contains a missing value.
# NOTE(review): rows dropped for missing values get no prediction - confirm
# whether the submission needs one row per id.
submission_logreg = pd.DataFrame({
    'id_num': test_data.loc[test_data_cleaned.index, 'id_num'],
    'political_affiliation_predicted': y_pred_logreg_test,
})
submission_logreg.head()


Accuracy with Logistic Regression: 0.6470588235294118


Unnamed: 0,id_num,political_affiliation_predicted
0,2,Republican
1,3,Independent
2,4,Independent
3,6,Republican
4,11,Independent
