In [174]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [175]:
def topk_by_date(date, score, k = 100):
    df ={'date':date, 'score':score}
    df = pd.DataFrame(df)
    topk = df.groupby('date').score.nlargest(k)
    return topk

In [176]:
df = pd.read_csv('reddit_worldnews_1year.csv')

In [177]:
train_valid_rate = 0.95
train_valid_point = int(train_valid_rate * len(df))
train_df = df.iloc[:train_valid_point, :]
valid_df = df.iloc[train_valid_point:, :]

In [178]:
df = train_df
date_list = pd.unique(df.date)
df.groupby('date').agg('count').head()

Unnamed: 0_level_0,Unnamed: 0,title,log_score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-06-01,1231,1231,1231
2017-06-02,2469,2469,2469
2017-06-03,1616,1616,1616
2017-06-04,1688,1688,1688
2017-06-05,1928,1928,1928


In [180]:
df['score_label'] = np.zeros(len(df), dtype=np.int32)
top_score = []
for date in date_list:
    df_by_date = df[df['date'] == date]
    topk = int(0.1 * len(df_by_date))
    top_score.extend(df_by_date.log_score.nlargest(topk).index)

In [181]:
df['score_label'][top_score] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [200]:
df_1 = df[df['score_label']==1]
df_0 = df[df['score_label']==0]
df_0 = df_0.sample(frac=1.0, replace=True)
df_concat = pd.concat([df_0, df_1])
df_concat = df_concat.reset_index()
df_concat.head()

Unnamed: 0.1,level_0,index,Unnamed: 0,title,log_score,date,score_label
0,12997,147746,147746,malaysias embattled najib questioned by anti c...,0.477121,2018-05-22,0
1,5781,131121,131121,nato and washington worry about russian subs i...,1.579784,2018-04-27,0
2,10779,181992,181992,dna tests disprove womans claim that dal was h...,0.30103,2017-09-07,0
3,8466,182190,182190,members mark super premium ply paper towels ro...,0.30103,2017-09-07,0
4,48566,154799,154799,indonesian court sentences first travel execut...,0.778151,2018-05-31,0


In [201]:
df = df_concat.sample(frac=1).reset_index(drop=True)
train_df = df
df.head()

Unnamed: 0.1,level_0,index,Unnamed: 0,title,log_score,date,score_label
0,40792,252830,252830,curiosity rover finds its crater was habitable...,2.970812,2017-06-03,1
1,30401,103812,103812,austrian man gets passed over for promotion in...,2.491362,2018-03-21,1
2,4428,193221,193221,north korea may detonate hydrogen bomb in the ...,3.421768,2017-09-22,1
3,6866,59648,59648,palestinian family shoots dead relative suspec...,1.672098,2018-01-20,1
4,54653,234083,234083,bob barr quotes about technology,0.30103,2017-07-13,0


In [202]:
vectorizer = CountVectorizer(ngram_range=(1,1))
vectorizer = vectorizer.fit(train_df['title'])

In [203]:
trans_title_train = vectorizer.transform(train_df['title'])
trans_title_valid = vectorizer.transform(valid_df['title'])

In [204]:
model = LinearRegression()

In [205]:
model = model.fit(trans_title_train, train_df['log_score'])

In [206]:
prediction = model.predict(trans_title_valid)

In [207]:
def evaluate_by_top_score(date, score, pred):
    df = pd.DataFrame({'date':date, 'score':score, 'pred':pred})
    date_list = pd.unique(date)
    
    df['score_label'] = np.zeros(len(df), dtype=np.int32)
    df['pred_label'] = np.zeros(len(df), dtype=np.int32)
    top_score = []
    top_pred = []
    
    for day in date_list:
        df_by_date = df[df['date'] == day]
        topk = int(0.1 * len(df_by_date))
        top_score.extend(df_by_date.score.nlargest(topk).index)
        top_pred.extend(df_by_date.pred.nlargest(topk).index)
        
    df['score_label'][top_score] = 1
    df['pred_label'][top_pred] = 1
    cross = pd.crosstab(df['score_label'], df['pred_label'], rownames=["Actual"], colnames=["Predicted"])
    precision = cross[1][1] / (cross[1][1] + cross[0][1]) 
    print(cross)
    print()
    print("precision: {}".format(precision))
    
    return precision

#type(prediction)
valid_df['pred'] = prediction
valid_df

Unnamed: 0.1,Unnamed: 0,title,log_score,date,pred,score_label,pred_label
288798,288798,vanessa kerry quotes about truth,0.301030,2017-06-13,0.052585,0,0
288799,288799,the us takes syrias al qaeda off terror watch ...,0.301030,2017-06-13,1.416842,0,0
288800,288800,covfefe act would make social media a presiden...,0.698970,2017-06-13,-0.181352,0,0
288801,288801,hot at the best escort service in noida,0.301030,2017-06-13,-1.036932,0,0
288802,288802,grow taller height enhancement program,0.301030,2017-06-13,0.742604,0,0
288803,288803,munich shooting four hurt at suburban railway ...,0.301030,2017-06-13,1.053469,0,0
288804,288804,erdoan to discuss qatar crisis with us preside...,0.602060,2017-06-13,1.437304,0,0
288805,288805,isis calls for attacks in west russia middle e...,1.041393,2017-06-13,0.993917,0,0
288806,288806,indonesian police foil shipment of detonators,0.954243,2017-06-13,1.115654,0,0
288807,288807,australia immigration consultants in bangalore,0.301030,2017-06-13,0.235214,0,0


In [208]:
valid_df['score_label'] = np.zeros(len(valid_df), dtype=np.int32)
valid_df['pred_label'] = np.zeros(len(valid_df), dtype=np.int32)
date_list = pd.unique(valid_df.date)
valid_df.head()

Unnamed: 0.1,Unnamed: 0,title,log_score,date,pred,score_label,pred_label
288798,288798,vanessa kerry quotes about truth,0.30103,2017-06-13,0.052585,0,0
288799,288799,the us takes syrias al qaeda off terror watch ...,0.30103,2017-06-13,1.416842,0,0
288800,288800,covfefe act would make social media a presiden...,0.69897,2017-06-13,-0.181352,0,0
288801,288801,hot at the best escort service in noida,0.30103,2017-06-13,-1.036932,0,0
288802,288802,grow taller height enhancement program,0.30103,2017-06-13,0.742604,0,0


In [209]:
top_score = []
top_pred = []
for date in date_list:
    df_by_date = valid_df[valid_df['date'] == date]
    topk = int(0.1 * len(df_by_date))
    top_score.extend(df_by_date.log_score.nlargest(topk).index)
    top_pred.extend(df_by_date.pred.nlargest(topk).index)
valid_df['score_label'][top_score] = 1
valid_df['pred_label'][top_pred] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [210]:
pd.crosstab(valid_df['score_label'], valid_df['pred_label'], rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12411,1276
1,1276,237
