# Data Loading

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used by this notebook
# all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the three feature CSVs.
# Point this single constant elsewhere to relocate the whole dataset.
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/ML_Proj/Data'

# One CSV per split (train / test / dev), all inside DATA_DIR.
DATA_PATH_Train = f'{DATA_DIR}/featured_train.csv'
DATA_PATH_Test = f'{DATA_DIR}/featured_test.csv'
DATA_PATH_DEV = f'{DATA_DIR}/featured_dev.csv'

In [None]:
import pandas as pd

# Read the three split files (train, test, dev - in that order) into DataFrames.
train, test, dev = (
    pd.read_csv(csv_path)
    for csv_path in (DATA_PATH_Train, DATA_PATH_Test, DATA_PATH_DEV)
)

In [None]:
# Remove the "Unnamed: 0" column from every split
# (presumably the row index written out when the CSVs were saved - TODO confirm).
# errors="ignore" keeps the cell idempotent: each frame is rebound to its own
# result, so a second run would otherwise raise KeyError on the missing column.
train = train.drop(columns=["Unnamed: 0"], errors="ignore")
test = test.drop(columns=["Unnamed: 0"], errors="ignore")
dev = dev.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Feature matrices: every column from position 7 onwards
# (assumes the first seven columns are non-feature fields - TODO confirm).
X_train, X_test, X_dev = (split.iloc[:, 7:] for split in (train, test, dev))

# Targets: the 'class' column of each split, cast to int.
y_train, y_test, y_dev = (
    split['class'].astype(int) for split in (train, test, dev)
)

In [None]:
# Sanity check: (rows, columns) of each feature matrix, one per line.
print(X_train.shape, X_test.shape, X_dev.shape, sep="\n")

(19826, 10049)
(2479, 10049)
(2478, 10049)


# Preprocessing

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV, PredefinedSplit
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from scipy import sparse
import matplotlib.pyplot as plt
import numpy as np
import seaborn
%matplotlib inline

## Feature Selection

In [None]:
# Drop features whose variance on the training split is at most 1e-5.
# The selector is fitted on the training data only.
selector = VarianceThreshold(0.00001)
x_train = selector.fit_transform(X_train)
print(x_train.shape)

# Positions of the retained columns, kept as a plain list of ints.
ind = selector.get_support(indices=True).tolist()

# Push dev/test through the fitted selector instead of slicing the DataFrames
# by hand. All three splits then share one type (NumPy array, same columns),
# so estimators fitted on x_train no longer warn about feature names when
# predicting on x_test.
x_dev = selector.transform(X_dev)
x_test = selector.transform(X_test)
print(x_dev.shape)
print(x_test.shape)

(19826, 8959)
(2478, 8959)
(2479, 8959)


## Use the Development Set as the Validation Fold

In [None]:
# Stack train and dev so GridSearchCV validates on the dev rows only.
# In PredefinedSplit, -1 marks a row that is never placed in a test fold and
# 0 assigns the row to test fold 0, so the dev rows form the single
# validation fold.
n_fit_rows = x_train.shape[0]
n_val_rows = x_dev.shape[0]
split_index = [-1] * n_fit_rows + [0] * n_val_rows

X = np.vstack((x_train, x_dev))
y = np.concatenate((y_train, y_dev))
pds = PredefinedSplit(test_fold=split_index)

# Running the Model

## Baseline

In [None]:
# Baseline: an SGD-based feature selector feeding an SGD classifier.
# SGD shuffles the data, so both estimators are seeded; without a fixed
# random_state the selected hyper-parameters and the scores change from
# run to run.
pipe = Pipeline(
        [('select', SelectFromModel(SGDClassifier(random_state=42))),
        ('model', SGDClassifier(random_state=42))])

# Lists the parameter names that can be addressed in the grid below.
print(pipe.get_params().keys())

# NOTE(review): scikit-learn 1.1 renamed the loss 'log' to 'log_loss' and
# 1.3 removed the old spelling; switch both 'log' entries when running on a
# newer release.
parameters = {'select__estimator__loss': ('log', 'hinge'),
              'model__loss': ('log', 'hinge'),
              'select__estimator__penalty': ('l1', 'l2'),
              'model__penalty': ('l1', 'l2'),
              'model__alpha': [0.0001, 0.001, 0.01, 1, 10, 100],
              'model__learning_rate': ("optimal", "constant", "adaptive"),
              'model__eta0': [0.0001, 0.001, 0.01, 0.1]
              }

# cv=pds gives a single split: fit on the train rows, score on the dev rows.
grid_search = GridSearchCV(estimator=pipe,
                           param_grid=parameters,
                           cv=pds,
                           verbose=2)

# Fit on a CSR sparse matrix. Converting an already-sparse X is harmless, so
# the cell can be re-run.
X = sparse.csr_matrix(X)
model = grid_search.fit(X, y)

In [None]:
# Predict the test split with the refitted best baseline pipeline.
# The pipeline was fitted on a sparse matrix (no column names), so pass a bare
# array: handing it a DataFrame triggers scikit-learn's
# "X has feature names, but ... was fitted without" warning.
y_preds1 = model.predict(np.asarray(x_test))
print(model.best_params_)

{'model__alpha': 0.0001, 'model__eta0': 0.0001, 'model__learning_rate': 'optimal', 'model__loss': 'log', 'model__penalty': 'l2', 'select__estimator__loss': 'hinge', 'select__estimator__penalty': 'l2'}


  f"X has feature names, but {self.__class__.__name__} was fitted without"


## Class Reweighting

In [None]:
# Same pipeline as the baseline, but both SGD estimators use
# class_weight='balanced'.
# Both are seeded so the search result is reproducible across runs.
pipe = Pipeline(
        [('select', SelectFromModel(SGDClassifier(class_weight='balanced',
                                                  random_state=42))),
        ('model', SGDClassifier(class_weight='balanced', random_state=42))])

# Lists the parameter names that can be addressed in the grid below.
print(pipe.get_params().keys())

# NOTE(review): scikit-learn 1.1 renamed the loss 'log' to 'log_loss' and
# 1.3 removed the old spelling; switch both 'log' entries when running on a
# newer release.
parameters = {'select__estimator__loss': ('log', 'hinge'),
              'model__loss': ('log', 'hinge'),
              'select__estimator__penalty': ('l1', 'l2'),
              'model__penalty': ('l1', 'l2'),
              'model__alpha': [0.0001, 0.001, 0.01, 1, 10, 100],
              'model__learning_rate': ("optimal", "constant", "adaptive"),
              'model__eta0': [0.0001, 0.001, 0.01, 0.1]
              }

# cv=pds gives a single split: fit on the train rows, score on the dev rows.
grid_search = GridSearchCV(estimator=pipe,
                           param_grid=parameters,
                           cv=pds,
                           verbose=2)

# Fit on a CSR sparse matrix. Converting an already-sparse X is harmless.
X = sparse.csr_matrix(X)
model = grid_search.fit(X, y)

# The pipeline was fitted without column names, so predict on a bare array to
# avoid scikit-learn's feature-name mismatch warning.
y_preds2 = model.predict(np.asarray(x_test))
print(model.best_params_)

## SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only, so dev and test keep their original
# class distribution. Note this uses the full X_train, not the
# variance-filtered x_train.
# SMOTE generates synthetic samples at random; a fixed seed makes the
# resampled set, and everything trained on it, reproducible.
oversample = SMOTE(random_state=42)
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)

In [None]:
# Rebuild the train+dev stack for the SMOTE experiment, using the oversampled
# training rows and the unfiltered dev features.
# In PredefinedSplit, -1 marks a row that is never placed in a test fold and
# 0 assigns the row to test fold 0, so the dev rows form the single
# validation fold.
n_fit_rows = X_train_smote.shape[0]
n_val_rows = X_dev.shape[0]
split_index = [-1] * n_fit_rows + [0] * n_val_rows

X = np.vstack((X_train_smote, X_dev))
y = np.concatenate((y_train_smote, y_dev))
pds = PredefinedSplit(test_fold=split_index)

In [None]:
# Shape of the stacked (oversampled train + dev) matrix built for the SMOTE run.
print(X.shape)

(48357, 10049)


In [None]:
# Class-weighted SGD pipeline, this time trained on the SMOTE-oversampled data.
# Both estimators are seeded so the search result is reproducible across runs.
pipe = Pipeline(
        [('select', SelectFromModel(SGDClassifier(class_weight='balanced',
                                                  random_state=42))),
        ('model', SGDClassifier(class_weight='balanced', random_state=42))])

# NOTE(review): scikit-learn 1.1 renamed the loss 'log' to 'log_loss' and
# 1.3 removed the old spelling; switch both 'log' entries when running on a
# newer release.
parameters = {'select__estimator__loss': ('log', 'hinge'),
              'model__loss': ('log', 'hinge'),
              'select__estimator__penalty': ('l1', 'l2'),
              'model__penalty': ('l1', 'l2'),
              'model__alpha': [0.0001, 0.001, 0.01, 1, 10, 100],
              'model__learning_rate': ("optimal", "constant", "adaptive"),
              'model__eta0': [0.0001, 0.001, 0.01, 0.1]
              }

# cv=pds gives a single split: fit on the oversampled train rows, score on
# the dev rows.
grid_search = GridSearchCV(estimator=pipe,
                           param_grid=parameters,
                           cv=pds,
                           verbose=2)

# Fit on a CSR sparse matrix. Converting an already-sparse X is harmless.
X = sparse.csr_matrix(X)
model = grid_search.fit(X, y)

# This model was fitted on the unfiltered feature set (built from
# X_train_smote / X_dev), so it must predict on X_test. The variance-filtered
# x_test has fewer columns and makes predict() raise a feature-count error.
# A bare array is passed because the fit data carried no column names.
y_preds = model.predict(np.asarray(X_test))

In [None]:
# Hyper-parameter combination chosen by the most recent grid search bound to `model`.
print(model.best_params_)

{'model__alpha': 0.0001, 'model__eta0': 0.0001, 'model__learning_rate': 'optimal', 'model__loss': 'hinge', 'model__penalty': 'l2', 'select__estimator__loss': 'hinge', 'select__estimator__penalty': 'l2'}


In [None]:
# Test-set predictions of the SMOTE-trained model. It was fitted on the
# unfiltered feature set, hence X_test rather than x_test.
# A bare array is passed because the model was fitted without column names;
# a DataFrame triggers scikit-learn's feature-name mismatch warning.
y_preds3 = model.predict(np.asarray(X_test))

  f"X has feature names, but {self.__class__.__name__} was fitted without"


# Model Evaluation

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

In [None]:
# --- Baseline model (y_preds1) ---
# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_preds1)
print(report)

# Keep the matrix under its own name: rebinding `confusion_matrix` would
# shadow the sklearn function and break every later call to it.
# The matrix uses the same predictions as the report (y_preds1).
conf_mat = confusion_matrix(y_test, y_preds1)

# Row-normalise: cell (i, j) is the fraction of true class i predicted as j.
matrix_proportions = conf_mat / conf_mat.sum(axis=1, keepdims=True)

# assumes the sorted label values correspond to Hate, Offensive, Neither - TODO confirm
names = ['Hate', 'Offensive', 'Neither']
confusion_df = pd.DataFrame(matrix_proportions, index=names, columns=names)

fig, ax = plt.subplots(figsize=(5, 5))
seaborn.heatmap(confusion_df, annot=True, annot_kws={"size": 12},
                cmap='gist_gray_r', cbar=False, square=True, fmt='.2f', ax=ax)
ax.set_title('Baseline', fontsize=14)
ax.set_ylabel(r'True categories', fontsize=14)
ax.set_xlabel(r'Predicted categories', fontsize=14)
ax.tick_params(labelsize=12)
plt.show()

In [None]:
# --- Class-weighted model (y_preds2) ---
# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_preds2)
print(report)

# Keep the matrix under its own name: rebinding `confusion_matrix` would
# shadow the sklearn function and break every later call to it.
# The matrix uses the same predictions as the report (y_preds2).
conf_mat = confusion_matrix(y_test, y_preds2)

# Row-normalise: cell (i, j) is the fraction of true class i predicted as j.
matrix_proportions = conf_mat / conf_mat.sum(axis=1, keepdims=True)

# assumes the sorted label values correspond to Hate, Offensive, Neither - TODO confirm
names = ['Hate', 'Offensive', 'Neither']
confusion_df = pd.DataFrame(matrix_proportions, index=names, columns=names)

fig, ax = plt.subplots(figsize=(5, 5))
seaborn.heatmap(confusion_df, annot=True, annot_kws={"size": 12},
                cmap='gist_gray_r', cbar=False, square=True, fmt='.2f', ax=ax)
ax.set_title('Class reweighting', fontsize=14)
ax.set_ylabel(r'True categories', fontsize=14)
ax.set_xlabel(r'Predicted categories', fontsize=14)
ax.tick_params(labelsize=12)
plt.show()

In [None]:
# --- SMOTE model (y_preds3) ---
# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_preds3)
print(report)

# Keep the matrix under its own name: rebinding `confusion_matrix` would
# shadow the sklearn function and break every later call to it.
# The matrix uses the same predictions as the report (y_preds3).
conf_mat = confusion_matrix(y_test, y_preds3)

# Row-normalise: cell (i, j) is the fraction of true class i predicted as j.
matrix_proportions = conf_mat / conf_mat.sum(axis=1, keepdims=True)

# assumes the sorted label values correspond to Hate, Offensive, Neither - TODO confirm
names = ['Hate', 'Offensive', 'Neither']
confusion_df = pd.DataFrame(matrix_proportions, index=names, columns=names)

fig, ax = plt.subplots(figsize=(5, 5))
seaborn.heatmap(confusion_df, annot=True, annot_kws={"size": 12},
                cmap='gist_gray_r', cbar=False, square=True, fmt='.2f', ax=ax)
ax.set_title('SMOTE', fontsize=14)
ax.set_ylabel(r'True categories', fontsize=14)
ax.set_xlabel(r'Predicted categories', fontsize=14)
ax.tick_params(labelsize=12)
plt.show()