# Spooky Author Classification - Kaggle Submission

In [1]:
import feature_engineering
from sklearn.feature_extraction.text import CountVectorizer
from topic_modeling import get_topic_probs, get_new_topic_probs
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import pandas as pd
import csv
import warnings
warnings.filterwarnings('ignore')

# Feature Engineering

In [5]:
# Build the training frame: take the output of prepare_data('train.csv') and
# run add_features over it row by row (axis=1).
train_data = feature_engineering.prepare_data('train.csv')
train_df = train_data.apply(feature_engineering.add_features, axis=1)

# Shift each of these columns by its own minimum so the smallest value is 0.
for column in ('vector_avg', 'FleischReadingEase'):
    train_df[column] = train_df[column] - train_df[column].min()

In [4]:
# Build the test frame the same way as the training frame, then index it by
# the 'id' column so later frames and the submission can reuse that index.
test_data = feature_engineering.prepare_data('test.csv', test=True)
test_df = test_data.apply(feature_engineering.add_features, axis=1)
test_df.set_index('id', inplace=True)

# Shift each of these columns by its own minimum so the smallest value is 0.
# NOTE(review): the offset here is the *test* minimum, whereas the training
# columns are shifted by the *training* minimum, so the two sets are not on
# exactly the same scale -- confirm this is intended.
for column in ('vector_avg', 'FleischReadingEase'):
    test_df[column] = test_df[column] - test_df[column].min()

## Topic Modeling

In [6]:
# get_topic_probs returns the training topic frame together with the model;
# that same model is then passed to get_new_topic_probs for the test set.
train_topic_probs, lda_model = get_topic_probs(train_df)
test_topic_probs = get_new_topic_probs(test_df, lda_model)

# Give each topic frame the index of the frame it was derived from, so the
# later pd.concat(axis=1) lines rows up by label.
for topic_frame, source_frame in ((train_topic_probs, train_df),
                                  (test_topic_probs, test_df)):
    topic_frame.index = source_frame.index


## TFIDF/Count Vectorizer

In [9]:
# Learn the vocabulary from the training text only, then encode both sets
# with that same fitted vectorizer.
cv = CountVectorizer().fit(train_df.text)
train_counts = cv.transform(train_df.text)
test_counts = cv.transform(test_df.text)

# Densify and wrap in DataFrames that share the index of their source frame.
cv_train = pd.DataFrame(train_counts.toarray(), index=train_df.index)
cv_test = pd.DataFrame(test_counts.toarray(), index=test_df.index)

## Modeling

In [10]:
# Pull the label column out of the frame (pop removes it and returns it),
# then discard the remaining columns that are not fed to the model.
y_train = train_df.pop('author')
train_df.drop(['text', 'lemmas', 'entities'], axis='columns', inplace=True)

# Final training matrix: remaining train_df columns + topic frame + word counts.
X_train = pd.concat(
    [train_df, train_topic_probs, cv_train],
    axis='columns',
)

In [37]:
# Label the test topic columns exactly like the training topic columns.
# The previous hard-coded list ended in 'topic3', which broke the 'topic_N'
# pattern of the other three labels; copying the labels from the training
# frame keeps both feature sets consistent by construction.
# NOTE(review): assumes train_topic_probs and test_topic_probs have the same
# number of columns in the same order -- confirm against topic_modeling.
test_topic_probs.columns = train_topic_probs.columns

In [39]:
# Discard the columns that are not fed to the model (the test frame has no
# 'author' column to remove).
test_df.drop(['text', 'lemmas', 'entities'], axis='columns', inplace=True)

# Final test matrix, assembled in the same order as X_train:
# remaining test_df columns + topic frame + word counts.
X_test = pd.concat(
    [test_df, test_topic_probs, cv_test],
    axis='columns',
)

In [None]:
# Fit a default-configured logistic regression on the training matrix
# (fit returns the estimator itself, so it can be chained).
lr = LogisticRegression().fit(X_train, y_train)

# One row of per-class probabilities for every row of X_test.
predictions = lr.predict_proba(X_test)

## Kaggle Submission File

In [46]:
# Write the Kaggle submission: one header row, then one row per test sample
# holding its id followed by the three predicted class probabilities.
#
# newline='' is required by the csv module; without it the writer's '\r\n'
# line endings become '\r\r\n' on Windows and show up as blank rows.
with open('submission.csv', mode='w', newline='') as submission_file:
    writer = csv.writer(submission_file, delimiter=',')

    # NOTE(review): the header assumes the columns of `predictions` are in
    # the order EAP, HPL, MWS -- verify against lr.classes_.
    writer.writerow(['id', 'EAP', 'HPL', 'MWS'])

    # X_test.index and predictions are walked in lockstep: row i of the
    # probabilities belongs to the i-th id.
    writer.writerows(
        [_id, preds[0], preds[1], preds[2]]
        for _id, preds in zip(X_test.index, predictions)
    )
    

In [2]:
import visualisations

In [3]:
# Load the raw training CSV into a fresh frame for the visualisation cells below.
df = pd.read_csv('train.csv')

In [4]:
# Get the corpus and model used for the pyLDAvis view below.
# NOTE: this rebinds `lda_model`, replacing the model returned earlier by
# get_topic_probs(train_df).
corpus,lda_model = visualisations.create_pyLDAvis(df)

In [5]:
import pyLDAvis.gensim
# Turn on inline notebook rendering, then prepare the visualisation data from
# the model, the corpus, and the model's own id2word mapping.
# NOTE(review): newer pyLDAvis releases appear to expose this module under a
# different name (gensim_models) -- confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)

In [6]:
# Bare expression as the last line of the cell so the notebook displays it.
vis