In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [3]:
def topk_by_date(date, score, k=100):
    """Return the ``k`` largest scores within each date.

    ``date`` and ``score`` are combined into a two-column frame, the rows are
    grouped by ``date``, and the ``k`` highest ``score`` values of every group
    are returned as a Series indexed by (date, original row label).
    """
    frame = pd.DataFrame({'date': date, 'score': score})
    return frame.groupby('date')['score'].nlargest(k)

In [4]:
# Load the dataset from a CSV file located relative to the working directory.
df = pd.read_csv('reddit_worldnews_1year.csv')

In [5]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,title,log_score,date
0,0,healthy skin for the new year,0.30103,2017-11-01
1,1,president obama wants to give young leaders ar...,0.778151,2017-11-01
2,2,new york terror attack truck crash in lower ma...,0.30103,2017-11-01
3,3,eu japan ask un to condemn n korea over rights...,0.90309,2017-11-01
4,4,sell my miami house fast real estate solutions,0.30103,2017-11-01


In [6]:
# Positional hold-out: the first 95% of rows train the model and the
# remaining rows are kept for validation (no shuffling is applied).
train_valid_rate = 0.95
train_valid_point = int(len(df) * train_valid_rate)
train_df = df.iloc[:train_valid_point]
valid_df = df.iloc[train_valid_point:]

In [7]:
# Build the unigram vocabulary from the training titles only, so no
# validation text influences the feature space.
vectorizer = CountVectorizer(ngram_range=(1, 1)).fit(train_df['title'])

In [8]:
# Encode both splits with the vocabulary fitted on the training titles.
trans_title_train, trans_title_valid = (
    vectorizer.transform(split['title']) for split in (train_df, valid_df)
)

In [9]:
# Linear regression estimator with scikit-learn's default settings.
model = LinearRegression()

In [None]:
# Fit on the vectorized training titles against their log_score values.
# NOTE(review): this cell carries no execution count in the saved notebook —
# confirm the fit actually completed before trusting the predictions below.
model = model.fit(trans_title_train, train_df['log_score'])

In [40]:
# Predict log_score for the vectorized validation titles.
prediction = model.predict(trans_title_valid)

In [41]:
# Display the predictions (the array repr elides the middle of long arrays).
prediction

array([0.21243608, 0.66057293, 0.47041384, ..., 0.29639351, 0.43302289,
       0.30101449])

In [43]:
# Side-by-side sanity check: first ten true log_score values, then the
# first ten predicted values for the same validation rows.
print(valid_df.log_score[:10])
print(prediction[:10])

288798    0.301030
288799    0.301030
288800    0.698970
288801    0.301030
288802    0.301030
288803    0.301030
288804    0.602060
288805    1.041393
288806    0.954243
288807    0.301030
Name: log_score, dtype: float64
[0.21243608 0.66057293 0.47041384 0.38223478 0.47320028 0.56463441
 0.71658346 0.85987971 0.93908423 0.38712033]


In [1]:
def evaluate_by_top_score(date, score, pred, top_fraction=0.1):
    """Measure how well predictions recover each day's top-scoring rows.

    For every distinct date, the ``int(top_fraction * n_rows)`` rows with the
    highest ``score`` are labelled as actual positives and the same number of
    rows with the highest ``pred`` as predicted positives. The confusion
    matrix over all rows is printed, followed by the precision.

    Parameters
    ----------
    date, score, pred : array-like of equal length
        Per-row date, true score and predicted score. They are matched by
        position; any pandas index carried by the inputs is ignored.
    top_fraction : float, default 0.1
        Share of each day's rows that counts as "top".

    Returns
    -------
    float
        TP / (TP + FP). NaN when no row is labelled as a predicted positive
        (e.g. every day has too few rows for ``top_fraction`` to select one).
    """
    # Plain arrays give the frame a fresh, unique RangeIndex: the row labels
    # collected below are unambiguous and the three inputs line up by position
    # instead of being aligned on whatever index each one happens to carry.
    df = pd.DataFrame({
        'date': np.asarray(date),
        'score': np.asarray(score),
        'pred': np.asarray(pred),
    })

    top_score = []
    top_pred = []
    for _, day_rows in df.groupby('date', sort=False):
        topk = int(top_fraction * len(day_rows))
        top_score.extend(day_rows['score'].nlargest(topk).index)
        top_pred.extend(day_rows['pred'].nlargest(topk).index)

    # Build the label columns in one step. Chained assignment of the form
    # df['col'][rows] = 1 is not guaranteed to write back into the frame.
    df['score_label'] = df.index.isin(top_score).astype(np.int32)
    df['pred_label'] = df.index.isin(top_pred).astype(np.int32)

    cross = pd.crosstab(df['score_label'], df['pred_label'], rownames=["Actual"], colnames=["Predicted"])
    # Guarantee a full 2x2 table even when one of the classes never occurs.
    cross = cross.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

    # Rows are actual labels, columns are predicted labels.
    true_pos = cross.loc[1, 1]
    false_pos = cross.loc[0, 1]
    predicted_pos = true_pos + false_pos
    # Precision divides by everything predicted positive (TP + FP); dividing
    # by TP + FN would be recall.
    precision = true_pos / predicted_pos if predicted_pos else float('nan')
    print(cross)
    print()
    print("precision: {}".format(precision))

    return precision
# Needs `valid_df` and `prediction` from the cells above to exist in this
# kernel. The validation frame has no `score`/`pred` columns: the true values
# are in `log_score` and the model output is the separate `prediction` array.
evaluate_by_top_score(valid_df.date, valid_df.log_score, prediction)

NameError: name 'valid_df' is not defined