In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [100]:
def topk_by_date(date, score, k=100):
    """Return the k largest scores within each date.

    Parameters
    ----------
    date : array-like
        Grouping key for every row.
    score : array-like
        Value ranked inside each date group.
    k : int, default 100
        Maximum number of rows kept per date.

    Returns
    -------
    pandas.Series
        The selected scores, named ``score``, indexed by
        (date, original row label).
    """
    frame = pd.DataFrame({'date': date, 'score': score})
    per_day_scores = frame.groupby('date')['score']
    return per_day_scores.nlargest(k)

In [101]:
# Load the Reddit worldnews posts; the columns used below are
# `title`, `log_score` and `date`.
df = pd.read_csv('reddit_worldnews_1year.csv')
# Disabled: chronological sort of the raw frame.
#df = df.sort_values(by=['date'])
df.head()

Unnamed: 0.1,Unnamed: 0,title,log_score,date
0,0,healthy skin for the new year,0.30103,2017-11-01
1,1,president obama wants to give young leaders ar...,0.778151,2017-11-01
2,2,new york terror attack truck crash in lower ma...,0.30103,2017-11-01
3,3,eu japan ask un to condemn n korea over rights...,0.90309,2017-11-01
4,4,sell my miami house fast real estate solutions,0.30103,2017-11-01


In [102]:
# Chronological hold-out: rows dated before SPLIT_DATE train the model and
# rows on or after it validate it. `date` is read from the CSV as
# 'YYYY-MM-DD' text, so string comparison orders the same as calendar order.
# (An earlier positional 95/5 split was replaced by this date-based one.)
SPLIT_DATE = '2018-05-01'
train_df = df[df.date < SPLIT_DATE]
valid_df = df[df.date >= SPLIT_DATE]
valid_df.head()

Unnamed: 0.1,Unnamed: 0,title,log_score,date
133364,133364,netanyahu to address country with dramatic new...,1.342423,2018-05-01
133365,133365,israeli prime minister benjamin netanyahu will...,0.0,2018-05-01
133366,133366,south korean leader says trump can take the nobel,0.0,2018-05-01
133367,133367,south korean loudspeakers silenced for good as...,1.230449,2018-05-01
133368,133368,trump suggests meeting kim jong un at koreas p...,0.477121,2018-05-01


In [103]:
# Work on an independent copy: the next cells add a `score_label` column, and
# writing into a frame that is itself a boolean-mask slice triggers pandas'
# SettingWithCopyWarning (and may not write through at all).
df = train_df.copy()
# Distinct dates in order of first appearance; reused by the labelling loop.
date_list = pd.unique(df.date)
# Sanity check: number of posts per day.
df.groupby('date').agg('count').head()

Unnamed: 0_level_0,Unnamed: 0,title,log_score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-06-01,1231,1231,1231
2017-06-02,2469,2469,2469
2017-06-03,1616,1616,1616
2017-06-04,1688,1688,1688
2017-06-05,1928,1928,1928


In [104]:
# Collect the row labels of each day's highest-scoring posts (top 10% by
# log_score); the following cell flips `score_label` to 1 for these rows.
TOP_FRACTION = 0.1
df['score_label'] = np.zeros(len(df), dtype=np.int32)
top_score = []
# One groupby pass instead of re-filtering the whole frame for every date;
# sort=False keeps the days in first-appearance order, like pd.unique did.
for day, day_scores in df.groupby('date', sort=False)['log_score']:
    # int() truncates, so days with fewer than 10 posts contribute no rows.
    topk = int(TOP_FRACTION * len(day_scores))
    top_score.extend(day_scores.nlargest(topk).index)

In [105]:
# Mark the daily top scorers. A single .loc call writes into `df` itself;
# the chained form df[col][rows] = 1 assigns through an intermediate object,
# which pandas warns about and which is a no-op under Copy-on-Write.
df.loc[top_score, 'score_label'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [106]:
# Split the labelled training rows by class.
df_1 = df[df['score_label']==1]
df_0 = df[df['score_label']==0]
# Resample the negatives with replacement. Seeded so that a fresh
# Restart & Run All draws the same rows every time.
# NOTE(review): frac=1.0 with replace=True keeps the number of negatives
# unchanged (a bootstrap), so the classes are not rebalanced — confirm
# whether down-sampling towards len(df_1) was intended.
RESAMPLE_SEED = 42
df_0 = df_0.sample(frac=1.0, replace=True, random_state=RESAMPLE_SEED)
df_concat = pd.concat([df_0, df_1])
# reset_index() keeps the original row label as an `index` column.
df_concat = df_concat.reset_index()
df_concat.head()

Unnamed: 0.1,index,Unnamed: 0,title,log_score,date,score_label
0,175229,175229,brooke shields quotes about mom,0.30103,2017-10-28,0
1,50004,50004,miley cyrus paid rent for a former voice conte...,0.30103,2018-01-07,0
2,296595,296595,is uber now possible after ceos departure cnet,0.30103,2017-06-22,0
3,179512,179512,really like football you have got to read thro...,0.30103,2017-09-03,0
4,257385,257385,contact trident legal service for alopecia aft...,0.30103,2017-06-08,0


In [107]:
# Shuffle the concatenated rows so the two classes are interleaved; seeded so
# the row order (and everything fitted on it) is reproducible across runs.
SHUFFLE_SEED = 42
df = df_concat.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
# From here on `train_df` is the resampled, shuffled training set.
train_df = df
df.head()

Unnamed: 0.1,index,Unnamed: 0,title,log_score,date,score_label
0,210041,210041,places to go while dating in vermont,0.30103,2017-08-13,0
1,87478,87478,dalton high school teacher uses a gun reckless...,0.0,2018-03-01,0
2,63489,63489,more young people needed as magistrates offici...,0.30103,2018-01-25,0
3,84412,84412,indian army chief pakistan china behind influx...,1.041393,2018-02-24,0
4,59731,59731,teen accessed top us security officials emails...,0.30103,2018-01-20,0


In [108]:
# Learn a unigram bag-of-words vocabulary from the training titles only.
vectorizer = CountVectorizer(ngram_range=(1, 1)).fit(train_df['title'])

In [109]:
# Encode both splits with the vocabulary fitted on the training titles.
trans_title_train, trans_title_valid = (
    vectorizer.transform(frame['title']) for frame in (train_df, valid_df)
)

In [110]:
# Plain linear regression with scikit-learn's default settings.
model = LinearRegression()

In [111]:
# Regress log_score on the word-count features of the training titles.
model = model.fit(trans_title_train, train_df['log_score'])

In [112]:
# Predicted log_score for every validation title, in valid_df row order.
prediction = model.predict(trans_title_valid)

In [113]:
# Peek at the validation rows again before scoring the predictions.
valid_df.head()

Unnamed: 0.1,Unnamed: 0,title,log_score,date
133364,133364,netanyahu to address country with dramatic new...,1.342423,2018-05-01
133365,133365,israeli prime minister benjamin netanyahu will...,0.0,2018-05-01
133366,133366,south korean leader says trump can take the nobel,0.0,2018-05-01
133367,133367,south korean loudspeakers silenced for good as...,1.230449,2018-05-01
133368,133368,trump suggests meeting kim jong un at koreas p...,0.477121,2018-05-01


In [114]:
def evaluate_by_top_score(date, score, pred, top_frac=0.1):
    """Measure how well predicted scores recover each day's top posts.

    For every distinct date, the ``int(top_frac * n_rows)`` rows with the
    highest actual ``score`` and the same number of rows with the highest
    ``pred`` are flagged. The two flag columns are cross-tabulated, the table
    and the precision are printed, and the precision is returned.

    Parameters
    ----------
    date : array-like
        Day of each row; rows are ranked within their own day.
    score : array-like
        Actual score of each row.
    pred : array-like
        Predicted score of each row, aligned with ``score``.
    top_frac : float, default 0.1
        Fraction of each day's rows treated as "top". The count is truncated
        with ``int``, so days with too few rows contribute no positives.

    Returns
    -------
    float
        TP / (TP + FP), i.e. the share of predicted-top rows that are
        actually top. ``nan`` when no row was predicted as top.
    """
    df = pd.DataFrame({'date': date, 'score': score, 'pred': pred})
    # Use a fresh positional index so the per-day row labels collected below
    # are unique even if the caller's index has repeated labels.
    df = df.reset_index(drop=True)

    top_score = []
    top_pred = []
    # sort=False keeps the days in first-appearance order; a single groupby
    # pass avoids re-filtering the whole frame for every date.
    for _day, day_rows in df.groupby('date', sort=False):
        topk = int(top_frac * len(day_rows))
        top_score.extend(day_rows['score'].nlargest(topk).index)
        top_pred.extend(day_rows['pred'].nlargest(topk).index)

    # Build the 0/1 flags in one shot rather than via chained assignment
    # (df[col][rows] = 1), which pandas warns about and which does not write
    # through under Copy-on-Write.
    df['score_label'] = df.index.isin(top_score).astype(np.int32)
    df['pred_label'] = df.index.isin(top_pred).astype(np.int32)

    cross = pd.crosstab(df['score_label'], df['pred_label'], rownames=["Actual"], colnames=["Predicted"])
    # Guarantee a full 2x2 table even when a class never occurs.
    cross = cross.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

    # Rows are Actual, columns are Predicted: precision divides the true
    # positives by everything in the Predicted == 1 column.
    true_pos = cross.loc[1, 1]
    false_pos = cross.loc[0, 1]
    predicted_pos = true_pos + false_pos
    precision = true_pos / predicted_pos if predicted_pos else float('nan')
    print(cross)
    print()
    print("precision: {}".format(precision))

    return precision

# Score the validation predictions: per-day overlap between the posts with the
# highest actual log_score and those with the highest predicted log_score.
evaluate_by_top_score(valid_df.date, valid_df.log_score, prediction)

Predicted      0     1
Actual                
0          17954  1691
1           1691   476

precision: 0.21965851407475773


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.21965851407475773