In [32]:
%matplotlib inline
import pandas as pd
import datetime
from sklearn import tree
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pysentiment
import pymysql
import os
import csv
from textblob import TextBlob


def get_remote_db_context():
    return pymysql.connect(user='admin', password='tr001', host='ECSC00104617.epam.com', database='tr_news_max_challenge')

def add_field(df, field, new_field, transformer):
    df[new_field] = df[field].apply(transformer)

def add_field_length(df, field, new_field, sep=None): 
    count = lambda val: len(val.split(sep) if sep else val) if not pd.isnull(val) else 0
    add_field(df, field, new_field, count)


def orig_data(table):
    df = None
    try:
        df = pd.read_pickle('D:\\thomson_reuters\\data_sets\\sql_cache_{0}.pkl'.format(table))
    except IOError:
        cnx =  get_remote_db_context()
        query = '''
            select _guid, orgs, body, topics, title
            from {0};
        '''.format(table)
        df = pd.read_sql(query, cnx)

        add_field_length(df, 'orgs', 'org_count', ' ')
        add_field_length(df, 'body', 'story_word_count', ' ')
        add_field_length(df, 'topics', 'topics_count', ' ')
        add_field_length(df, 'title', 'title_word_count', ' ')

        df.to_pickle('D:\\thomson_reuters\\data_sets\\sql_cache_{0}.pkl'.format(table))
        
    return df

def get_df(file_name, sep, key):
    data_folder = 'd:\\thomson_reuters\\data_sets\\'
    path = os.path.join(data_folder, file_name)
    df = pd.read_csv(path, sep=sep).rename(columns={key: '_guid'})
    return df


def add_sentiment(df):

    df['body_front'] = df['body'].apply(lambda s: s[:500])
    
#     lm = pysentiment.LM()
#     for sentiment in ('Polarity', 'Positive', 'Negative', 'Subjectivity'):
#         py_senti = lambda s: 0 if pd.isnull(s) else lm.get_score(lm.tokenize(s))[sentiment]
#         df['py_title_' + sentiment] = df['title'].apply(py_senti)
#         df['py_body_' + sentiment] = df['body_front'].apply(py_senti)

#     tb_polarity_senti = lambda s: TextBlob(s).sentiment[0]
#     tb_subjectivity_senti = lambda s: TextBlob(s).sentiment[1]

#     df['tb_body_polarity'] = df['body_front'].apply(tb_polarity_senti)
#     df['tb_body_subjectivity'] = df['body_front'].apply(tb_subjectivity_senti)
#     df['tb_title_polarity'] = df['title'].apply(tb_polarity_senti)
#     df['tb_title_subjectivity'] = df['title'].apply(tb_subjectivity_senti)
    
    sia = SentimentIntensityAnalyzer()
    
    for sentiment in ('compound', 'neg', 'neu', 'pos'):
        nltk_senti = lambda s: sia.polarity_scores(s)[sentiment]
        df['sentiment_' + sentiment] = df['body_front'].apply(nltk_senti)

    df.drop('body_front', axis=1, inplace=True)
    
def add_features(df):

        add_field(df, 'significance', 'is_significant', lambda x: 0 if x == 0 else 1)
        add_field(df, 'significance', 'is_very_significant', lambda x: 1 if x == 2 else 0)
        add_field(df, 'date.time', 'hour', lambda x: datetime.datetime.fromtimestamp(int(x)).hour)
        add_sentiment(df)


    
def get_working_df(df_type, refresh=False):
    if refresh:
        df_orig_data = orig_data('labeled_{0}'.format(df_type))
        df_orig_data.drop('topics', axis=1, inplace=True)
        df_tokens = get_df('token_tfidf_{0}.csv'.format(df_type), '\t', '_guid')
        df_tokens.drop('title', axis=1, inplace=True)
        df_bgrams = get_df('bi_gram_tfidf_{0}.csv'.format(df_type), ',', '_guid')
        df_bgrams.drop('source.significance', axis=1, inplace=True)        
        df_topics = get_df('topics_tfidf_{0}.csv'.format(df_type), ',', 'id')
        df_topics.drop('topics', axis=1, inplace=True)
        df = pd.merge(df_tokens, df_orig_data, on='_guid', how='inner')
        df = pd.merge(df, df_bgrams, on='_guid', how='inner')
        df = pd.merge(df, df_topics, on='_guid', how='inner')
        add_features(df)
        df.drop(['_data_source','_id', 'title', 'Unnamed: 0','source.significance','orgs', 'body',
                 'date.time','journal.code','organizations','persons'], axis=1, inplace=True)
#         df.drop('_guid', axis=1, inplace=True)
        df = df[list(set(df.columns))]
        df.to_pickle('D:\\thomson_reuters\\data_sets\\working_dataframe_{0}.pkl'.format(df_type))
        return df
    else:
        return pd.read_pickle('D:\\thomson_reuters\\data_sets\\working_dataframe_{0}.pkl'.format(df_type))
  

In [33]:
df_train = get_working_df('train', True)
df_test = get_working_df('test', True)



In [35]:
columns = ['_guid', 'sentiment_pos', 'sentiment_neg', 'sentiment_neu', 'sentiment_compound', 'story_word_count', 'topics_count', 'title_word_count', 'org_count']

df_test[columns].to_csv('add_features_test.csv', sep=',', quoting=csv.QUOTE_ALL)
df_train[columns].to_csv('add_features_train.csv', sep=',', quoting=csv.QUOTE_ALL)

In [36]:
df_test[columns].head()

Unnamed: 0,_guid,sentiment_pos,sentiment_neg,sentiment_neu,sentiment_compound,story_word_count,topics_count,title_word_count,org_count
0,I003a0090e9d311e49ce48b4c25b70f44,0.158,0.0,0.842,0.8779,336,1,10,0
1,I0064c650f00311e48ec8ed0cb568cff3,0.131,0.039,0.831,0.7579,647,2,2,0
2,I008ce4f0f04911e490899ddc8face275,0.035,0.04,0.926,-0.128,462,2,3,0
3,I009bc470a3b311e5ac01b95a1193ecad,0.104,0.029,0.867,0.6249,1894,1,5,0
4,I00d18ee0f02411e4bf7bbb192cc48d22,0.159,0.0,0.841,0.9001,616,1,15,0


In [4]:
from __future__ import division
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
import pickle


df_main = get_working_df('train')

sig_tokens = set([t for t, c in pickle.load(open('data\\token_freqs.pkl', mode='r'))])
sig_bgs = set([bg for bg, c in pickle.load(open('data\\bi_gram_freqs.pkl', mode='r'))])
other_features = set(['nltk_body_compound', 'nltk_body_neg', 'nltk_body_neu', 'nltk_body_pos', 
                      'org_count','story_word_count','topics_count', 'title_word_count', 'hour'])

sig_features = sig_tokens | sig_bgs | other_features


y_cols = ['significance', 'is_significant', 'is_very_significant']
x_cols = [c for c in df_main.columns if c not in y_cols and c in sig_features]




classifiers = [
    GradientBoostingClassifier(),
    LogisticRegression(),
    DecisionTreeClassifier(criterion='entropy', max_depth=8),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    KNeighborsClassifier(n_neighbors=3, metric='cosine', algorithm='brute'),
    SVC(),
    LinearSVC(loss='hinge'),
]





def tt_split(df):
    msk = np.random.rand(len(df)) < 0.8
    df_test = df[~msk]
    df_train = df[msk]

    return df_train, df_test
    

def test_classifier(clf, df_train, df_test, x_cols, y_col, average_type):
    X_test, y_test = df_test[x_cols], df_test[y_col]
    X_train, y_train = df_train[x_cols], df_train[y_col]
    model = clf.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score(y_pred, y_test, average_type)
    return model, y_pred, y_test

def score(y_pred, y_test, average_type):
    score = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average=average_type)
    print '\t\tAccuracy: ', score, '\t', 'F1: ', f1


for clf in classifiers:

    df = df_main[x_cols + y_cols].dropna()

    df_train, df_test = tt_split(df)
    
    N = len(df_train)
    
    df = None

    model_name = str(clf).split('(')[0]

    print model_name
    print '\tis_significant'
    print 'N = ', N
    
    sig_model, y_pred_s, y_test_s = test_classifier(clf, df_train, df_test, x_cols, 'is_significant', 'binary')

    def sig_score(scores):
        pred_s, real = scores
        if pred_s == 0:
            if real == 0:
                return 10
            elif real == 1:
                return 3
            else:
                return -3
        else:
            return None
            
    ns_tr_score = [s for s in map(sig_score, zip(y_pred_s, df_test['significance'])) if s]

    y_train_pred_s = sig_model.predict(df_train[x_cols])
    
    pred_train_s_mask = map(lambda x: x == 1, y_train_pred_s)
    pred_test_s_mask = map(lambda x: x == 1, y_pred_s)
    
    df_sig_train = df_train[pred_train_s_mask]
    df_sig_test = df_test[pred_test_s_mask]

    print '\tis_very_significant'
    print 'N = ', len(df_sig_train)      

    vsig_model, y_pred_vs, y_test_vs = test_classifier(clf, df_sig_train, df_sig_test, x_cols, 'is_very_significant', 'binary')

    def very_sig_score(scores):
        pred_vs, real = scores
        if pred_vs == 1:
            if real == 2:
                return 10
            elif real == 1:
                return 3
            else:
                return -3
        else:
            if real == 1:
                return 10
            else:
                return 3            
    
    s_tr_score = map(very_sig_score ,zip(y_pred_vs, df_sig_test['significance']))

    print 'TR score', (sum(ns_tr_score) + sum(s_tr_score)) / (len(ns_tr_score) + len(s_tr_score))
    
    if model_name != 'LogisticRegression':
        print '\tsignificant_cats'
        cat_model, y_pred, y_test = test_classifier(clf, df_train, df_test, x_cols, 'significance', 'weighted')

    clf = None
#y_cat_pred = pred[0].combine(pred[1], lambda s, vs: 0 if s == 0 else (2 if vs == 1 else 1))

GradientBoostingClassifier
	is_significant
N =  8081
		Accuracy:  0.898572131955 	F1:  0.92832289492
	is_very_significant
N =  5872
		Accuracy:  0.804158283032 	F1:  0.754208754209
TR score 8.27474150665
	significant_cats
		Accuracy:  0.774495322501 	F1:  0.775528657199
LogisticRegression
	is_significant
N =  8037
		Accuracy:  0.88578313253 	F1:  0.915688367129
	is_very_significant
N =  5477
		Accuracy:  0.795486600846 	F1:  0.77093206951
TR score 8.16192771084
DecisionTreeClassifier
	is_significant
N =  8004
		Accuracy:  0.828747628083 	F1:  0.881289049655
	is_very_significant
N =  6187
		Accuracy:  0.79704797048 	F1:  0.746932515337
TR score 7.65891840607
	significant_cats
		Accuracy:  0.679316888046 	F1:  0.683377243491
RandomForestClassifier
	is_significant
N =  8097
		Accuracy:  0.888337468983 	F1:  0.918743228602
	is_very_significant
N =  5399
		Accuracy:  0.795325779037 	F1:  0.75611814346
TR score 8.17320099256
	significant_cats
		Accuracy:  0.754838709677 	F1:  0.755260544497


In [78]:
sum(list([1,2,43,4]))

50

In [77]:
s1 = pd.Series([-1 for _ in xrange(5)])
s2 = pd.Series([1,2,3,2,5])

s1 = (s1 + s2).apply(lambda x: x)
s1

0    0
1    1
2    2
3    1
4    4
dtype: int64

In [1]:
import pandas as pd

df_train = pd.read_csv('D:\\thomson_reuters\\data_sets\\token_tfidf_training.csv', sep='\t')
df_test = pd.read_csv('D:\\thomson_reuters\\data_sets\\token_tfidf_testing.csv', sep='\t')

In [5]:
set(df_train.columns) - set(df_test.columns)

{'announcements_', 'fortnight', 'nta', 'steepest', 'stoc', 'summat', 'vwp'}

In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pysentiment
from textblob import TextBlob

def add_sentiment(df):

    df['body_front'] = df['body'].apply(lambda s: s[:200])
    
    for sentiment in ('Polarity', 'Positive', 'Negative', 'Subjectivity'):
        py_senti = lambda s: 0 if pd.isnull(s) else lm.get_score(lm.tokenize(s))[sentiment]
        df['py_title_' + sentiment] = df['title'].apply(py_senti)
        df['py_body_' + sentiment] = df['body_front'].apply(py_senti)

    tb_polarity_senti = lambda s: TextBlob(s).sentiment[0]
    tb_subjectivity_senti = lambda s: TextBlob(s).sentiment[1]

    df['tb_body_polarity'] = df['body_front'].apply(tb_polarity_senti)
    df['tb_body_subjectivity'] = df['body_front'].apply(tb_subjectivity_senti)
    df['tb_title_polarity'] = df['title'].apply(tb_polarity_senti)
    df['tb_title_subjectivity'] = df['title'].apply(tb_subjectivity_senti)
    
    sia = SentimentIntensityAnalyzer()
    
    for sentiment in ('compound', 'neg', 'neu', 'pos'):
        nltk_senti = lambda s: sia.polarity_scores(s)[sentiment]
        df['nltk_title_' + sentiment] = df['title'].apply(nltk_senti)
        df['nltk_body_' + sentiment] = df['body_front'].apply(nltk_senti)

0.39166666666666666

In [60]:
dt = lambda x: datetime.datetime.fromtimestamp(int(x)).hour

dt('1430439473')

20