# In [None]:
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from tqdm import tqdm

# Load the labelled training set and the unlabelled set to predict on.
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

train_data.info()

test_data.info()

train_data.dropna().head()

train_data['target'].value_counts()

# Report every training column that contains missing values.
# Iterating over (label, count) pairs avoids positional integer indexing on a
# label-indexed Series, which pandas has deprecated.
null_values = train_data.isnull().sum()
for column_name, null_count in null_values.items():
    if null_count > 0:
        print('{:.2f}% ({}) Null Values Present in "{}" Feature'.format(
            null_count / len(train_data) * 100, null_count, column_name))

# Fill missing keywords from the next row, then from the previous row so that
# a gap at the very end of the column is covered as well.
# The result is assigned back to the frame: `fillna(method=..., inplace=True)`
# on a column selection relies on a deprecated keyword and on chained in-place
# mutation, which does not update the frame under pandas copy-on-write.
for frame in (train_data, test_data):
    frame['keyword'] = frame['keyword'].bfill().ffill()

test_data['keyword'].value_counts()

# Ordered (contraction, expansion) rules applied to lower-cased text.
# Irregular forms come first: the generic "n't" rule would otherwise turn
# "won't" / "can't" / "shan't" into "wo not" / "ca not" / "sha not".
_CONTRACTION_RULES = (
    ("won't", "will not"),
    ("can't", "can not"),
    ("shan't", "shall not"),
    ("n't", " not"),
    ("'re", " are"),
    ("'s", " is"),
    ("'d", " would"),
    ("'ll", " will"),
    ("'t", " not"),
    ("'ve", " have"),
    ("'m", " am"),
)


def decontration(text):
    """Expand common English contractions and return the text lower-cased.

    The text is lower-cased *before* matching so that capitalised input such
    as "Won't" or "CAN'T" is expanded too, and the typographic apostrophe
    (U+2019) is normalised to a plain ``'`` so both spellings are handled.

    Note that every ``'s`` is rewritten to `` is``, so possessives
    ("john's") are expanded the same way as "it's".

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string with the contractions listed in
        ``_CONTRACTION_RULES`` expanded, in rule order.
    """
    text = text.lower().replace("\u2019", "'")
    for contraction, expansion in _CONTRACTION_RULES:
        text = text.replace(contraction, expansion)
    return text

# Patterns are compiled once because cleaning_text runs once per row.
_URL_PATTERN = re.compile(r'http\S+')
_NON_ALNUM_PATTERN = re.compile(r'[^A-Za-z0-9]+')

# Lazily filled cache so the NLTK stop-word list is loaded only once, and only
# when cleaning_text is first called.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the English NLTK stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def cleaning_text(text):
    """Normalise one text string into a space-separated bag of tokens.

    Steps, in order: drop ``http...`` URLs, expand contractions and
    lower-case via ``decontration``, replace every run of characters other
    than letters and digits with a single space, then drop English stop
    words and tokens of two characters or fewer.

    Commas are treated as separators and all ten digits are kept. Keeping
    commas glued punctuation to words ("the,"), which let them slip past the
    stop-word and length filters; keeping only digits 0-3 split numbers such
    as "2014" into fragments.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces (possibly empty).
    """
    text = _URL_PATTERN.sub(' ', text)
    text = decontration(text)
    text = _NON_ALNUM_PATTERN.sub(' ', text)

    stop_words = _english_stop_words()
    return " ".join(
        word for word in text.split()
        if len(word) > 2 and word not in stop_words
    )

# Clean the 'text' column of both data sets in place with cleaning_text.
# One loop over both frames replaces two copies of the same append loop;
# tqdm only wraps the iteration to show a progress bar per frame.
for frame in (train_data, test_data):
    frame['text'] = [cleaning_text(raw_text) for raw_text in tqdm(frame['text'])]

train_data['keyword']

# Features are the keyword and cleaned text columns; the label is 'target'.
X = train_data[['keyword', 'text']]
y = train_data['target']

# Hold out 20% for validation, stratified on the label.
# The split is seeded (same seed as the classifier further down) so that the
# reported scores and the tuned hyper-parameter are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=910)

X_train

# Bag-of-words over the cleaned text; words seen in fewer than 3 training
# documents are ignored. Fitted on the training split only.
text_vectorizer = CountVectorizer(min_df=3)
X_train_text = text_vectorizer.fit_transform(X_train['text'])
X_test_text = text_vectorizer.transform(X_test['text'])
test_text = text_vectorizer.transform(test_data['text'])

print("Vectorized Training Text Data Shape    : ", X_train_text.shape)
print("Vectorized Testing Text Data Shape     : ", X_test_text.shape)
print("Vectorized Real Testing Text Shape     : ", test_text.shape)



# Separate bag-of-words over the keyword column, kept under its own name so
# the fitted text vectorizer above is not overwritten.
keyword_vectorizer = CountVectorizer()
X_train_keyword = keyword_vectorizer.fit_transform(X_train['keyword'])
X_test_keyword = keyword_vectorizer.transform(X_test['keyword'])
test_keyword = keyword_vectorizer.transform(test_data['keyword'])
# The original code left `vectorizer` bound to the keyword vectorizer; keep
# that name available in case later code refers to it.
vectorizer = keyword_vectorizer

print("Vectorized Training Data Shape    : ", X_train_keyword.shape)
print("Vectorized Testing Data Shape     : ", X_test_keyword.shape)
print("Vectorized Real Testing Data Shape: ", test_keyword.shape)

print(X_train_keyword)

# Concatenate text and keyword features column-wise. The matrices stay sparse
# (CSR) instead of being expanded with .toarray(), which avoids allocating
# dense arrays that are almost entirely zeros.
X_train_final = sparse.hstack((X_train_text, X_train_keyword), format='csr')
X_test_final = sparse.hstack((X_test_text, X_test_keyword), format='csr')
testing_data = sparse.hstack((test_text, test_keyword), format='csr')

X_test_final.shape

# Tune the regularisation strength of a logistic-regression model trained
# with SGD, using 5-fold cross-validation scored by F1.
# `loss='log_loss'` is the current name of the logistic loss; the old
# spelling 'log' was deprecated and later removed from scikit-learn.
parameters = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
sgd_clf = SGDClassifier(class_weight='balanced', penalty='l2', loss='log_loss', random_state=910)
clf = GridSearchCV(sgd_clf, parameters, n_jobs=-1, cv=5, scoring=make_scorer(f1_score))
clf.fit(X_train_final, y_train)
print("Best Parameters ", clf.best_params_)

# Use the estimator GridSearchCV refitted on the whole training split with
# the best alpha, rather than a hard-coded alpha that can drift out of sync
# with the search result.
sgd_clf = clf.best_estimator_

train_preds = sgd_clf.predict(X_train_final)
test_preds = sgd_clf.predict(X_test_final)

print("Train Score ", f1_score(y_train, train_preds))
print('Test Score ', f1_score(y_test, test_preds))

# Predict the unlabelled set and write the submission as id/target pairs.
submission_file = pd.DataFrame({'id': test_data['id'], 'target': sgd_clf.predict(testing_data)})
submission_file.to_csv("submission_file.csv", index=False)

