In [23]:
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
import pandas as pd
import re
from sklearn.metrics import log_loss
import spacy
import numpy as np

In [24]:
art_df = pd.read_csv('./articles_temp.csv')
ratings= pd.read_csv('./ratings_temp.csv')
art_df.dropna(inplace=True)

In [25]:
ratings_clean = list(ratings['article'])
articles_clean = list(art_df['content'])

ratings_clean = [re.sub("\d+", "", t) for t in ratings_clean]
articles_clean = [re.sub("\d+", "", t) for t in articles_clean]
ratings_clean = [re.sub("\[.*\]", "", t) for t in ratings_clean]
articles_clean = [re.sub("\[.*\]", "", t) for t in articles_clean]
ratings_clean = [re.sub("\\\\.", "", t) for t in ratings_clean]
articles_clean = [re.sub("\\\\.", "", t) for t in articles_clean]



In [26]:
nlp = spacy.load('en_core_web_sm')
nlp_rating = [[word.text for word in nlp(x) if word.pos_ in ['NOUN','VERB','ADJ']] for x in ratings_clean]
nlp_articles = [[word.text for word in nlp(x) if word.pos_ in ['NOUN','VERB','ADJ']] for x in articles_clean]

In [27]:
join_nlp_articles = [" ".join(t) for t in nlp_articles]

In [28]:
join_nlp_rating = [" ".join(t) for t in nlp_rating]

In [29]:
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(join_nlp_rating)
y = ratings['negative']
lr = LogisticRegression(solver='lbfgs')
lr.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
X_test = tfidf.transform(join_nlp_articles)
rating_pred = lr.predict(X_test)
rating_lp = lr.predict_log_proba(X_test)

In [31]:
coef = pd.DataFrame(lr.coef_).T
coef.columns=['coefs']
coef['features']=tfidf.get_feature_names()

In [32]:
lr.score(X,y)

0.8975265017667845

In [33]:
log_loss([1,0]*2909,rating_lp)

0.6931471805599453

In [34]:
probs = pd.DataFrame(np.exp(rating_lp),columns=['notneg','neg'])

In [35]:
probs['text'] = art_df['content']

In [37]:
probs['cleantext']=join_nlp_articles

In [46]:
probs['neg'].quantile([.1,.9])

0.1    0.327794
0.9    0.567066
Name: neg, dtype: float64

In [47]:
to_rating = probs[(probs['neg']>.567)|(probs['neg']<.328)]

In [58]:
ratings.head()

Unnamed: 0.1,Unnamed: 0,id,article,rating,negative,cleantext
0,0,10007,Treasury Secretary Steven Mnuchin said hes enc...,3.0,0,said s encouraging raise debt ceiling summer b...
1,1,10008,President Donald Trump attacked the media afte...,2.0,1,attacked media news reports showed attendees h...
2,2,10041,(Reuters) - Eighty-nine Democrats in the U.S. ...,2.0,1,late last week have said support starting impe...
3,3,10262,A bipartisan bill that would limit the increas...,3.0,0,bipartisan bill that would limit increase drug...
4,4,10304,"Sen. John Cornyn, R-Texas, the number two Repu...",3.0,0,number speaks aide markup session vote new fed...


In [55]:
to_rating.head()

Unnamed: 0,notneg,neg,text,cleantext,id,negative
0,0.307652,0.692348,(Reuters) - Eighty-nine Democrats in the U.S. ...,late last week have said support starting impe...,0,1
1,0.307652,0.692348,(Reuters) - Eighty-nine Democrats in the U.S. ...,late last week have said support starting impe...,1,1
3,0.340919,0.659081,Donald Trump held another neo-fascist rally ye...,held neo fascist rally yesterday crowd chanted...,3,1
6,0.265563,0.734437,"Michael D'Antonio is the author of the book ""N...",is author book co author opinions expressed co...,6,1
18,0.672913,0.327087,President Donald Trump speaks with reporters o...,speaks reporters departing Going Democratic co...,18,0


In [63]:
to_rating['id']=to_rating.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [57]:
ratings['cleantext']=join_nlp_rating

In [54]:
to_rating['negative']=to_rating['neg'].apply(lambda x: 1 if x>.5 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [67]:
ratings_out = ratings[['id','article','cleantext','negative']].append(to_rating[['id','text','cleantext','negative']])

In [68]:
ratings_out.to_csv('./ratings_out.csv')