In [1]:
import tensorflow as tf

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib 
from matplotlib import pyplot as plt
import sklearn
import nltk
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold

In [3]:
# load training data
df = pd.read_csv('PMChallenge2018/challenge2-train.txt', names=['label', 'text'], sep='\t')
df['label'].value_counts()

0    9197
1     994
Name: label, dtype: int64

In [4]:
exclude = set(string.punctuation)
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# tolower, remove punctuation, stem words, and tokenize
def preproc(text):
    text = text.lower()
    text = ''.join(ch for ch in text if ch not in exclude)
    text = ps.stem(text)
    word_tokens = word_tokenize(text)
    filtered_sentence = [ps.stem(w) for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_sentence)

In [5]:
# preprocess "text" column into "feature" column
df['feature'] = df['text'].apply(lambda x: preproc(x))
df.head(10)

Unnamed: 0,label,text,feature
0,0,Has Ben Carson started talking about hummus ye...,ben carson start talk hummu yet curiou find ha...
1,0,Christie: Obama created ISIS by not launching ...,christi obama creat isi launch war irangopdeb
2,1,"Trump is right, you can't fight everyone at th...",trump right cant fight everyon timethi dude ac...
3,0,#GOPDebate I'm somewhat amazed that I'm hearin...,gopdeb im somewhat amaz im hear someth resembl...
4,0,The ins and outs of the American Dream is unde...,in out american dream understood ricksantorum ...
5,0,So the #GOPDebate seems to be running on fear?...,gopdeb seem run fear got
6,0,Charlie sheen: did someone say winning? #GOPDe...,charli sheen someon say win gopdeb
7,0,I guess you might say Ted's mouth is on... Cru...,guess might say ted mouth cruz control gopdeb
8,0,"Since they lack central nervous systems, tuxed...",sinc lack central nervou system tuxedo cap rar...
9,1,"To be quite frank, Trump is right #GOPDebate",quit frank trump right gopdeb


In [7]:
# 2 fold cross validation
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import KFold
from scipy import interp
# Run classifier with cross-validation and plot ROC curves
n = 2
cv = KFold(n_splits=n)
#classifier = sklearn.ensemble.GradientBoostingClassifier()
classifier = sklearn.linear_model.LogisticRegression()
#classifier = sklearn.svm.SVC(probability=True, kernel='linear')

config = tf.contrib.learn.RunConfig(tf_random_seed=42)
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

i = 0
X = df['feature']
y = df['label']
for train, test in cv.split(X, y):
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=1000, min_df=2, stop_words='english', use_idf=False)
    Xv = vectorizer.fit_transform(X[train])
    print(Xv.shape)
    
    feature_cols = tf.contrib.learn.infer_real_valued_columns_from_input(Xv)
    dnn_clf = tf.contrib.learn.DNNClassifier(hidden_units=[800, 500], n_classes=2,
                                            feature_columns=feature_cols,config=config)
    tf.contrib.learn.SKCompat(dnn_clf)
    dnn_clf.fit(Xv, y[train], batch_size=50, steps=10000)
    y_pred = dnn_clf.predict(vectorizer.transform(X[test]))
    accuracy_score(y[test], y_pred['probabilities'])
    
    
#print ('Average AUC in {} folds cross valudation: {}'.format(n, np.average(aucs)))

(5095, 1000)
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a20ca6710>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': 42, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': None}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompa

ValueError: setting an array element with a sequence.

In [7]:
# now train again on the entire set
vectorizer = TfidfVectorizer(max_df=0.5, max_features=1000, min_df=2, stop_words='english', use_idf=False)
m = classifier.fit(vectorizer.fit_transform(X), y)

In [8]:
# predict on the unlabeled set
df2 = pd.read_csv('PMChallenge2018/challenge2-unlabeledtestfile.txt', names=['label', 'text'], sep='\t')
df2['feature'] = df2['text'].apply(lambda x: preproc(x))
X2 = vectorizer.transform(df2['feature'])
df2['label'] = m.predict_proba(X2)[:,1]
df2.head(10)

Unnamed: 0,label,text,feature
0,0.832307,"It's TRUE how ppl cannot see it, they are all ...",true ppl see say thing trump said gopdeb trump...
1,0.030791,@JeffreyToobin please tell @wolfblitzer to KIL...,jeffreytoobin pleas tell wolfblitz kill microp...
2,0.022389,"If this debate was high school, I think I'd wa...",debat high school think id want ted cruz take ...
3,0.019604,War. Death. Fear. Killing. The only themes of ...,war death fear kill theme tonight gopdeb
4,0.052031,The Middle East hasn't been destabilized for t...,middl east hasnt destabil thousand year romeby...
5,0.026232,I like that the 80s gameshow buzzer on the GOP...,like 80 gameshow buzzer gop debat fit hope nex...
6,0.042035,Replace ISIS with ICEE. It'll be great. Everyo...,replac isi ice itll great everyon love trumpfo...
7,0.003068,#GOPDebate my favs tonight: Carly Fiorina and ...,gopdeb fav tonight carli fiorina rubioüòû rub...
8,0.054915,This is probably the sanest thing said about U...,probabl sanest thing said us foreign polici to...
9,0.039544,Who the fuck is this guy? #GOPDebate,fuck guy gopdeb


In [9]:
df2['label'].to_csv('PMChallenge2018/challenge2-score.txt', index=False)