# In [None]:  (notebook cell prompt left over from the export)
#https://www.kaggle.com/c/nlp-243-f21-core-rel-pred/data?select=test_data.csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.utils import class_weight
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import warnings


# Load the training corpus.
# The file is read by path, the same way test_data.csv is read further down,
# so both files are decoded by pandas with the same default encoding instead
# of the locale-dependent default of a bare open().
corpus_entries = []  # not referenced in the visible code; kept in case other cells use it
corpus_raw = pd.read_csv('train_data_merged_labels.csv')

# Text features and label encoding for the training corpus.
utterances = corpus_raw['utterances']

# Count -> TF-IDF pipeline fitted on every utterance. It is not used to build
# `x` below; it stays available as an alternative featurizer
# (x = pipe.transform(utterances)).
pipe = Pipeline(steps=[
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])
pipe.fit(utterances)

# Active featurizer: raw bag-of-words counts. The same fitted vectorizer is
# reused later to encode the utterances from test_data.csv.
x_vectorizer = CountVectorizer()
x = x_vectorizer.fit_transform(utterances)

# Constructed but never applied (alternative: x = nrmlzr.transform(x)).
nrmlzr = Normalizer()

# Unfitted TF-IDF alternative
# (x = x_tvectorizer.fit_transform(utterances)).
x_tvectorizer = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Binarized label matrix built from the 'Core Relations' column; the fitted
# binarizer is needed later to map predictions back to label values.
y_binarizer = LabelBinarizer()
y = y_binarizer.fit_transform(corpus_raw['Core Relations'])

# Hold out 30% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=7,
)

# Scaling statistics are learned from the training rows only and then applied
# to both splits. with_mean=False disables centering (presumably because the
# CountVectorizer output is sparse -- confirm).
scalar = StandardScaler(with_mean=False)
scalar.fit(x_train)
x_train_scaled = scalar.transform(x_train)
x_test_scaled = scalar.transform(x_test)

# Candidate values of C, consumed by the GridSearchCV object built further
# down in this script.
param_grid = {'C': [0.1, 1, 10, 100, 1000]}

# Baseline: class-weighted decision tree trained on the scaled features.
# (Disabled experiment: RandomOverSampler(sampling_strategy='minority')
# applied to the training split before fitting.)
dtc = DecisionTreeClassifier(max_depth=140, class_weight='balanced')
dtc.fit(x_train_scaled, y_train)

# Score the tree on the held-out split.
y_test_pred = dtc.predict(x_test_scaled)
dtc_accuracy = accuracy_score(y_test, y_test_pred)
print(dtc_accuracy)

# One-vs-rest linear SVM trained on the raw (unscaled) count features.
lscv = LinearSVC(C=1.0)

# A grid search over C is configured here but never fitted in this script.
grid = GridSearchCV(estimator=lscv, param_grid=param_grid, refit=True, verbose=0)

ovr = OneVsRestClassifier(estimator=lscv)
with warnings.catch_warnings():
    # Silence every warning raised while fitting.
    warnings.simplefilter("ignore")
    ovr.fit(x_train, y_train)

# Score the SVM on the held-out split.
y_test_pred = ovr.predict(x_test)
svc_accuracy = accuracy_score(y_test, y_test_pred)
print(svc_accuracy)

# One-vs-rest Complement Naive Bayes on the same raw (unscaled) count
# features used for the linear SVM.
cnb = ComplementNB()
ovr2 = OneVsRestClassifier(estimator=cnb)
with warnings.catch_warnings():
    # Silence every warning raised while fitting.
    warnings.simplefilter("ignore")
    ovr2.fit(x_train, y_train)

# Score the Naive Bayes model on the held-out split.
y_test_pred = ovr2.predict(x_test)
cnb_accuracy = accuracy_score(y_test, y_test_pred)
print(cnb_accuracy)

# Predict labels for test_data.csv with the one-vs-rest linear SVM and write
# them to predictions.csv (columns: Id, Predicted).
test_raw = pd.read_csv('test_data.csv')

# Encode the test utterances with the vocabulary fitted on the training
# corpus. The counts are deliberately NOT passed through `scalar`: `ovr` was
# fitted on the unscaled x_train and validated on the unscaled x_test, so
# scaling only at prediction time would feed the model features on a
# different scale from the ones it was trained on.
x_test_actual = x_vectorizer.transform(test_raw['utterances'])
y_actual_pred = ovr.predict(x_test_actual)

# inverse_transform maps each predicted indicator row back to a label value;
# Id is simply the row position in the test file.
d = {
    'Id': range(len(y_actual_pred)),
    'Predicted': y_binarizer.inverse_transform(y_actual_pred),
}
df = pd.DataFrame(data=d)
df.to_csv('predictions.csv', index=False)



# Notebook output: FileNotFoundError: ignored