In [7]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV, cross_val_predict
from sklearn.metrics import mean_squared_error, accuracy_score

from xgboost import XGBClassifier

In [18]:
train = pd.read_csv(r"C:\Users\tuke-\Documents\GitHub\Final\Classification\data\CAH-201803-train.csv")

train = train.dropna()

test = pd.read_csv(r"C:\Users\tuke-\Documents\GitHub\Final\Classification\data\CAH-201803-test.csv")

test = test.dropna()


In [4]:
ct = ColumnTransformer(
  [
    ("dummify", 
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object))
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

In [5]:
X = train.drop(['id_num','political_affiliation'], axis = 1)

y = train[['political_affiliation']]

print(y)

le =LabelEncoder()

le.fit(y)

list(le.classes_)

y = le.transform(y)

print(y)

    political_affiliation
0             Independent
1             Independent
2                Democrat
3             Independent
4              Republican
..                    ...
164            Republican
165            Republican
166            Republican
167            Republican
168            Republican

[169 rows x 1 columns]
[1 1 0 1 2 2 0 0 1 1 0 2 1 0 2 2 0 0 2 1 1 1 1 1 0 1 0 1 2 1 1 1 2 1 0 1 1
 0 2 2 1 2 1 1 2 2 0 1 0 1 1 2 2 0 2 0 0 2 0 1 1 0 1 1 1 0 1 0 1 2 1 0 2 2
 0 1 0 0 0 0 2 1 2 1 1 2 1 1 0 2 0 2 1 2 0 1 0 2 0 2 2 2 0 0 0 0 1 0 1 2 2
 0 1 1 0 0 0 0 2 0 0 2 1 2 1 2 1 2 1 0 2 0 0 1 0 2 1 2 0 0 0 0 1 1 0 2 1 0
 0 0 2 2 0 2 2 0 0 2 2 0 2 1 2 1 2 2 2 2 2]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [34]:
from xgboost import XGBClassifier


X_test, X_train, y_test, y_train = train_test_split(X,y, test_size = 0.25)

pipeline = Pipeline(
  [
   ("preprocessing", ct),
  ("Regression", XGBClassifier())]
)

fitted = pipeline.fit(X_train,y_train)

param_grid = {
    'Regression__n_estimators': [100, 200, 300, 400, 500],
    'Regression__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'Regression__max_depth': [3, 4, 5, 6, 7, 8],
    #'Regression__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    #'Regression__gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5]
}


grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

grid_search.fit(X, y)

best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

best_score = -grid_search.best_score_
print(f"Best Accuracy score: {best_score}")

y_pred = grid_search.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print('Test accuracy: %f' % accuracy)

Best parameters: {'Regression__learning_rate': 0.05, 'Regression__max_depth': 3, 'Regression__n_estimators': 200}
Best Accuracy score: -0.5922794117647059
Test RMSE: 0.968254


In [8]:
X = train.drop(['id_num','political_affiliation'], axis = 1)

y = train['political_affiliation']

le = LabelEncoder()

le.fit(y)

list(le.classes_)

y = le.transform(y)

pipeline = Pipeline(
  [
   ("preprocessing", ct),
  ("Regression", XGBClassifier(learning_rate = 0.05, max_depth = 3, n_estimators = 200))]
)

fitted = pipeline.fit(X,y)

y = fitted.predict(test)

original_labels = le.inverse_transform(y)

final_predictions = pd.DataFrame(
    {"id_num": test["id_num"],
    "political_affiliation_predicted": original_labels}
)


print(final_predictions)

     id_num political_affiliation_predicted
0         2                      Republican
1         3                        Democrat
2         4                     Independent
3         6                     Independent
4        11                     Independent
..      ...                             ...
161     327                        Democrat
162     330                     Independent
163     331                      Republican
164     333                        Democrat
165     334                      Republican

[166 rows x 2 columns]


In [9]:
final_predictions.to_csv(r'C:\Users\tuke-\Desktop\final_predictions_political.csv', index=False)

In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = train.drop(['id_num','political_affiliation'], axis = 1)

y = train['political_affiliation']

le = LabelEncoder()

le.fit(y)

list(le.classes_)

y = le.transform(y)

X_test, X_train, y_test, y_train = train_test_split(X, y, test_size = 0.25)

pipeline = Pipeline(
  [
   ("preprocessing", ct),
   ("Regression", svm.SVC())
  ]
)

fitted = pipeline.fit(X_train, y_train)

param_grid = {
    'Regression__C': [0.1, 1, 10, 100, 1000],  
    'Regression__gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
    'Regression__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'Regression__degree': [2, 3, 4, 5]
}


grid_search = GridSearchCV(pipeline, param_grid, cv=20, scoring='accuracy', n_jobs=-1)

grid_search.fit(X, y)

best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

best_score = -grid_search.best_score_
print(f"Best Accuracy score: {best_score}")

y_pred = grid_search.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print('Test accuracy: %f' % accuracy)


In [11]:
X = train.drop(['id_num','political_affiliation'], axis = 1)

y = train['political_affiliation']

le = LabelEncoder()

le.fit(y)

list(le.classes_)

y = le.transform(y)

pipeline = Pipeline(
  [
   ("preprocessing", ct),
  ("Regression", svm.SVC(C = 0.1, gamma = 1, kernel = 'linear'))]
)

fitted = pipeline.fit(X,y)

y = fitted.predict(test)

original_labels = le.inverse_transform(y)

final_predictions = pd.DataFrame(
    {"id_num": test["id_num"],
    "political_affiliation_predicted": original_labels}
)


print(final_predictions)

     id_num political_affiliation_predicted
0         2                      Republican
1         3                        Democrat
2         4                     Independent
3         6                      Republican
4        11                     Independent
..      ...                             ...
161     327                        Democrat
162     330                     Independent
163     331                        Democrat
164     333                        Democrat
165     334                      Republican

[166 rows x 2 columns]


In [12]:
final_predictions.to_csv(r'C:\Users\tuke-\Desktop\final_predictions_political.csv', index=False)

In [20]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

X = train.drop(['id_num','political_affiliation'], axis = 1)

y = train['political_affiliation']

le = LabelEncoder()

le.fit(y)

list(le.classes_)

y = le.transform(y)

X_test, X_train, y_test, y_train = train_test_split(X, y, test_size = 0.25)

pipeline = Pipeline(
  [
   ("preprocessing", ct),
   ("Regression", LDA())
  ]
)

fitted = pipeline.fit(X_train, y_train)

param_grid = {
    'Regression__solver': ['svd', 'lsqr', 'eigen'],  
    'Regression__shrinkage': [None, 'auto', 0.1, 0.2, 0.3], 
}

grid_search = GridSearchCV(pipeline, param_grid, cv=20, scoring='accuracy', n_jobs=-1)

grid_search.fit(X, y)

best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

best_score = -grid_search.best_score_
print(f"Best Accuracy score: {best_score}")

y_pred = grid_search.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print('Test accuracy: %f' % accuracy)


Best parameters: {'Regression__shrinkage': 'auto', 'Regression__solver': 'lsqr'}
Best Accuracy score: -0.625
Test accuracy: 0.722222


100 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\tuke-\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\tuke-\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\tuke-\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\tuke-\anaconda3\Lib\site-packages\sklearn\base.py", line

In [21]:
X = train.drop(['id_num','political_affiliation'], axis = 1)

y = train['political_affiliation']

le = LabelEncoder()

le.fit(y)

list(le.classes_)

y = le.transform(y)

pipeline = Pipeline(
  [
   ("preprocessing", ct),
  ("Regression", LDA(shrinkage = 'auto', solver = 'lsqr') )]
)

fitted = pipeline.fit(X,y)

y = fitted.predict(test)

original_labels = le.inverse_transform(y)

final_predictions = pd.DataFrame(
    {"id_num": test["id_num"],
    "political_affiliation_predicted": original_labels}
)


print(final_predictions)

     id_num political_affiliation_predicted
0         2                      Republican
1         3                        Democrat
2         4                     Independent
3         6                      Republican
4        11                     Independent
..      ...                             ...
161     327                        Democrat
162     330                     Independent
163     331                        Democrat
164     333                        Democrat
165     334                      Republican

[166 rows x 2 columns]


In [22]:
final_predictions.to_csv(r'C:\Users\tuke-\Desktop\final_predictions_political.csv', index=False)