In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets, random_projection 
from sklearn.decomposition import PCA
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.base import TransformerMixin, BaseEstimator
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
files = ['examplefilename1.csv', 'examplefilename2.csv']

In [3]:
dfs = [pd.read_csv(file) for file in files]

df = pd.concat(dfs, ignore_index=True).reset_index()
df.drop_duplicates(subset = ['url', 'story_content'], inplace = True)
df.dropna(subset = ['story_content'], inplace = True)

In [4]:
len(df['url'].unique()), len(df['url'])

(375, 400)

In [6]:
df = df.drop('Unnamed: 0', 1)
df.dropna(subset =['url'], how = 'all',  inplace = True)

In [7]:
df['story_content_parsed'] = df['story_content'].copy()

In [8]:
df['story_content_parsed']= df['story_content_parsed'].str.replace('\\n', ' ').replace('\n', ' ')

In [9]:
df['figure_caption_parsed']= df['figure_caption'].copy()

In [10]:
import re

In [11]:
remove_bracket_content = lambda x: re.sub("[\(\[].*?[\)\]]", "", str(x))

In [12]:
df['figure_caption_parsed'] = df['figure_caption_parsed'].apply(remove_bracket_content)

In [13]:
header_split = lambda x: str(x).split('\n')

In [14]:
df['split_header'] = df['header_content'].apply(header_split)

In [15]:
def header_parse(x):
    
    if 'LIVE' in x:
        title = ''.join(str(i)+' ' for i in x)
        topic = 'News Livestream'
        author = 'none'

    elif 'LIVE' not in x and len(x) >0:
        title = ''.join(str(i)+' ' for i in x[:-2])
        topic = str(x[0])
        author = str(x[-1])
        
    return title, topic, author

In [16]:
def title_parse(x):
        
    if 'LIVE' in x:
        title = ''.join(str(i)+' ' for i in x)

    elif 'LIVE' not in x and len(x) >0:
        title = ''.join(str(i)+' ' for i in x[:-2])

    return title

In [17]:
def topic_parse(x):
    
    if 'LIVE' in x:
        topic = 'News Livestream'

    elif 'LIVE' not in x and len(x) >0:
        topic = str(x[0])
        
    return topic

In [18]:
def author_parse(x):
    
    if 'LIVE' in x:
        author = 'none'

    elif 'LIVE' not in x and len(x) >0:
        author = str(x[-1])
        
    return author

In [19]:
def news_topic_parse(x):
    
    url_str = str(x)
    
    if re.search('news/(.*)/',url_str):
        news_topic = re.search('news/(.*)/',url_str).group(1)
        return news_topic
    
    else:
        return 0
    

In [20]:
df['source'] = 'cbc'

In [21]:
df['title'] = df['split_header'].apply(lambda x: title_parse(x))
df['topic'] = df['split_header'].apply(lambda x: topic_parse(x))
df['author'] = df['split_header'].apply(lambda x: author_parse(x))
df['news_topic']=df['url'].apply(lambda x: news_topic_parse(x))

In [22]:
df['headline'] = df['title'].copy()

In [23]:
#df.head()

In [24]:
rebel_files = ['examplerebelfile1.csv', 
               'examplerebelfile2.csv']

In [25]:
rebel_dfs = [pd.read_csv(file) for file in rebel_files]

rebel_df = pd.concat(rebel_dfs, ignore_index=True).reset_index()

rebel_df.drop_duplicates(subset = ['url', 'story_content'], inplace = True)

In [26]:
len(rebel_df['url'].unique()), len(rebel_df['url'])

(206, 206)

In [27]:
rebel_df = rebel_df.drop('Unnamed: 0', 1).reset_index()
rebel_df.dropna(subset =['url'], how = 'all',  inplace = True)

In [28]:
rebel_df['story_content_parsed']= rebel_df['story_content'].copy().str.replace('\\n', ' ').replace('\n', ' ')

In [29]:
rebel_df['split_header'] = rebel_df['header_content'].apply(header_split)

In [30]:
def rebel_headline_parse(x):
    if len(x)>1:
        headline = str(x[1])
    else:
        headline = 0
    return headline

def rebel_publish_date_parse(x):
    if len(x)>0:
        publish_date = str(x[0])
    else:
        publish_date = 0
    return publish_date

def rebel_author_parse(x):
    if len(x)>2:
        author = str(x[2])
    else:
        author = 0
    return author

In [31]:
rebel_df['author'] = rebel_df['split_header'].apply(lambda x: rebel_author_parse(x))
rebel_df['publish_date'] = rebel_df['split_header'].apply(lambda x: rebel_publish_date_parse(x))
rebel_df['headline'] = rebel_df['split_header'].apply(lambda x: rebel_headline_parse(x))

In [32]:
rebel_df['source'] = 'rebel media'

In [33]:
rebel_df['news_topic'] = 'rebel'

In [34]:
star_tabloid_files = ['examplestarfile1.csv',
                     'examplestarfile2.csv']

In [35]:
star_tabloid_dfs = [pd.read_csv(file) for file in star_tabloid_files]

star_tabloid_df = pd.concat(star_tabloid_dfs, ignore_index=True).reset_index()

star_tabloid_df.drop_duplicates(subset = ['url', 'story_content'], inplace = True)

In [36]:
len(star_tabloid_df['url'].unique()), len(star_tabloid_df['url'])

(176, 227)

In [37]:
star_tabloid_df = star_tabloid_df.drop('Unnamed: 0', 1).reset_index()
star_tabloid_df.dropna(subset =['url'], how = 'all',  inplace = True)

In [38]:
star_tabloid_df['story_content_parsed']= star_tabloid_df['story_content'].copy().str.replace('\\n', ' ').replace('\n', ' ')

In [39]:
len(star_tabloid_df[star_tabloid_df['story_content_parsed']=='Powered By']), len(star_tabloid_df['story_content_parsed'])

(21, 226)

In [40]:
star_tabloid_df = star_tabloid_df.drop(star_tabloid_df.index[star_tabloid_df['story_content_parsed'] == 'Powered By'])

In [41]:
len(star_tabloid_df[star_tabloid_df['story_content_parsed']=='Powered By']), len(star_tabloid_df['story_content_parsed'])

(0, 205)

In [42]:
#star_tabloid_df.head()

In [43]:
star_tabloid_df['source'] = 'starmagazine'

In [44]:
star_tabloid_df['split_header'] = star_tabloid_df['header_content'].apply(header_split)

In [45]:
def star_get_author_date(x):
    if len(x)>1:
        author_date = str(x[-1])
    else:
        author_date = 0
    return author_date

def star_get_headline(x):
    if len(x)>0:
        headline = ''.join(str(i)+' ' for i in x[:-1])
    else:
        headline = 0
    return headline    
    

In [46]:
star_tabloid_df['headline'] = star_tabloid_df['split_header'].apply(lambda x: star_get_headline(x))
star_tabloid_df['author_date'] = star_tabloid_df['split_header'].apply(lambda x: star_get_author_date(x))
star_tabloid_df['news_topic'] = 'gossip'

In [47]:
#star_tabloid_df.head()

In [48]:
cbc_min_df = df[['headline','story_content_parsed', 'news_topic', 'source']].copy()
rebel_min_df = rebel_df[['headline', 'story_content_parsed', 'news_topic', 'source']].copy()
star_min_df = star_tabloid_df[['headline', 'story_content_parsed', 'news_topic', 'source']].copy()

frames = [cbc_min_df, rebel_min_df, star_min_df]

In [49]:
article_df = pd.concat(frames)

In [50]:
#article_df.head()
#article_df['headline'] 

In [51]:
make_string = lambda x: str(x).strip()

In [52]:
def make_string(x):
    return(str(x).strip())

In [53]:
def clean_string(x):
    ''.join(x.split("\\"))
    x.replace('( )', '').replace('. "', '."')
    return x

In [54]:
article_df['headline'] = article_df['headline'].apply(make_string).apply(lambda x: clean_string(x)).copy()
article_df['story_content_parsed'] = article_df['story_content_parsed'].apply(make_string).apply(lambda x: clean_string(x)).copy()
article_df['news_topic'] = article_df['news_topic'].apply(make_string).apply(lambda x: clean_string(x)).copy()
article_df['source'] = article_df['source'].apply(make_string).apply(lambda x: clean_string(x)).copy()

In [55]:
#NOTE - will NOT want to do this when looking just at headlines instead of story content
#this code is for classifying body text
article_df = article_df.drop(article_df.index[article_df['story_content_parsed'] == 'nan'])

In [56]:
X = article_df[['headline','story_content_parsed']].reset_index()
y = article_df[['news_topic', 'source']].reset_index()

X = X.drop('index', 1)
y = y.drop('index', 1)

X = X.fillna(value = 'empty')
y = y.fillna(value= 'empty')

In [57]:
import nltk
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc

In [58]:
X.shape, y.shape

((690, 2), (690, 2))

In [59]:
custom_stop_words =['warning', 'star', 'language warning', 'cbc', 'Rebel', 'language', '( )', '. “','. ``', '00', '( )', 'i' , 'a', 'magazine', 'ok', 't', '``', 'weekly', '“', '”','s', '‘', "'s", '’', "'re", "n't", 'didn', 'powered', "ikea®", 'www.ikea.com/us/kitchens', 'ikea', 've', 'aug.', 'ca', 'l', 'la', 'rebel', "'m", 'kitchen', 'kitchens', 'quantico' ]

In [60]:
custom_punct = list(string.punctuation)
custom_punct.append('""')
custom_punct.append("''")
custom_punct.remove('!')
custom_punct.remove('?')
custom_punct.remove('-')

In [61]:
count_vect = CountVectorizer(analyzer='word', stop_words=custom_stop_words + custom_punct, tokenizer=word_tokenize)


In [62]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

In [63]:
X = X['story_content_parsed']
y = y['source']
#y = y['news_topic']

In [64]:
def pos_tag_and_flat(x):
    text = nltk.tokenize.word_tokenize(x)
    pos = nltk.pos_tag(text)
    x_flat = [e for l in pos for e in l]
    str1 = ' '.join(str(e) for e in x_flat)
    return str1

In [65]:
def pos_tag_only(x):
    text = nltk.tokenize.word_tokenize(x)
    pos = nltk.pos_tag(text)
    x_pos = [tup[0] if '!' in tup else tup[0] if '?' in tup else tup[0] if '-' in tup else tup[1] for tup in pos ]
    x_pos_str = ' '.join(str(e) for e in x_pos)
    return x_pos_str


In [66]:
def replace_names(text):
    first_names_df = pd.read_csv('first_names.txt', header = None, sep = ',')
    most_common_first_names_df = first_names_df.nlargest(800, [2]).reset_index()
    common_names = ['Brad', 'Harry', 'Kris', 'Kylie', 'Don', 'Madonna', 'Shia', 'Tamron', 'Kim', 'Gwenyth', 'Leonardo', 'Mathew', 'Macaulay', 'Farrah', 'Beckinsale', 'Dale', 'Polanski', 'A-Rod', 'Bette', 'Mel', 'Bella']
    for n in range(0, len(most_common_first_names_df)):
        common_names.append(most_common_first_names_df[0][n])
    for name in common_names:
        new_text = text.replace(name, 'NAME1')
    return (str(new_text))

In [67]:
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [68]:
class DenseTransformer(TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

In [69]:
class make_string_class(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, docs):
        string_class = [make_string(text)
                for text in docs]
        string_series = pd.Series(string_class)
#        print("after make_string_class the type is:", type(string_series), string_series.shape)
        return(string_series)

In [70]:
class pos_tag_and_flat_class(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, docs):
        pos_tag_list = [pos_tag_and_flat(text) for text in docs.iloc[:, 0]]
        pos_tag_series = pd.Series(pos_tag_list)
#        print("after pos_tag_and_flat_class:", type(pos_tag_series), pos_tag_series.shape)
        return(pos_tag_series)

In [71]:
class pos_tag_only_class(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, docs):
        pos_tag_list = [pos_tag_only(text) for text in docs.iloc[:, 0]]
        pos_tag_series = pd.Series(pos_tag_list)
#        print("after pos_tag_and_flat_class:", type(pos_tag_series), pos_tag_series.shape)
        return(pos_tag_series)

In [72]:
class replace_names_class(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, docs):
        replace_names_data = [replace_names(text) for text in docs]
        name1_names = pd.Series(replace_names_data)
#        print("after replace_names_class:", name1_names.shape)
        return(name1_names)

In [73]:
class make_df_class(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, docs):
        list_df = []
        for text in docs:
            list_df.append(text)
        df_mid = pd.DataFrame(list_df, columns = ['input'])
        print("after make_df_class:", type(df_mid), df_mid.shape)
        return(df_mid)

In [74]:
clean_content_pipeline = Pipeline([
    ('makestr', make_string_class()),
    ('make_df', make_df_class()),
    ('pos_tag_flat', pos_tag_only_class()),
    ('name1', replace_names_class())
])

In [75]:
ppl1 = Pipeline([
              ('clean_input', clean_content_pipeline),
              ('vectorizer', CountVectorizer(ngram_range=(1, 3), analyzer='word', stop_words=custom_stop_words + custom_punct, tokenizer=word_tokenize)),
              ('to_dense', DenseTransformer()),
              ('clf',   LogisticRegression())
      ])

# train the classifier AFTER applying tagging
content_model = ppl1.fit(X, y)

# test the classifier AFTER applying tagging
#y_pred = model.predict(X_test)
#y_pred_proba = model.predict_proba(X_test)

after make_df_class: <class 'pandas.core.frame.DataFrame'> (690, 1)


In [76]:
import pickle
from sklearn.externals import joblib
joblib.dump(content_model, 'news_content_classification_model.pkl') 

['news_content_classification_model.pkl']

In [77]:
#try a few example content strings to see output
content_Googl = ["Also, when considering the costs and benefits, we should keep in mind that Google's funding is finite so its allocation is more zero-sum than is generally acknowledged. The harm of Google’s biases I strongly believe in gender and racial diversity, and I think we should strive for more. However, to achieve a more equal gender and race representation, Google has created several discriminatory practices: ● Programs, mentoring, and classes only for people with a certain gender or race5 ● A high priority queue and special treatment for “diversity” candidates ● Hiring practices which can effectively lower the bar for “diversity” candidates by decreasing the false negative rate ● Reconsidering any set of people if it’s not “diverse” enough, but not showing that same scrutiny in the reverse direction (clear confirmation bias) ● Setting org level OKRs for increased representation which can incentivize illegal discrimination6 ____________________________________________________________________________ 5 Stretch, BOLD, CSSI, Engineering Practicum (to an extent), and several other Google funded internal and external programs are for people with a certain gender or race. 6 Instead set Googlegeist OKRs, potentially for certain demographics. We can increase representation at an org level by either making it a better environment for certain groups (which would be seen in survey scores) or discriminating based on a protected status (which is illegal and I’ve seen it done). Increased representation OKRs can incentivize the latter and create zero-sum struggles between orgs. These practices are based on false assumptions generated by our biases and can actually increase race and gender tensions. We’re told by senior leadership that what we’re doing is both the morally and economically correct thing to do, but without evidence this is just veiled left ideology7 that can irreparably harm Google. Why we’re blind We all have biases and use motivated reasoning to dismiss ideas that run counter to our internal values. Just as some on the Right deny science that runs counter to the “God > humans > environment” hierarchy (e.g., evolution and climate change), the Left tends to deny science concerning biological differences between people (e.g., IQ8 and sex differences). Thankfully, climate scientists and evolutionary biologists generally aren’t on the right. Unfortunately, the overwhelming majority of humanities and social sciences lean left (about 95%), which creates enormous confirmation bias, changes what’s being studied, and maintains myths like social constructionism and the gender wage gap9. Google’s left leaning makes us blind to this bias and uncritical of its results, which we’re using to justify highly politicized programs. In addition to the Left’s affinity for those it sees as weak, humans are generally biased towards protecting females. As mentioned before, this likely evolved because males are biologically disposable and because women are generally more cooperative and agreeable than men. We have extensive government and Google programs, fields of study, and legal and social norms to protect women, but when a man complains about a gender issue issue affecting men, he’s labelled as a misogynist and a whiner10. Nearly every difference between men and women is interpreted as a form of women’s oppression. As with many things in life, gender differences are often a case of “grass being greener on the other side”; unfortunately, taxpayer and Google money is being spent to water only one side of the lawn. "]

In [78]:
content1 = ["Shaun Weiss, who played the roly-poly goalie in Disney’s 1992 hockey hit, “The Mighty Ducks,” has been sent to the penalty box for petty theft — and will serve 150 days in L.A. County Jail! The 38-year-old Weiss was nabbed in July for swiping $151 in merchandise from an electronics store — his second shoplifting offense. “Hopefully, when he gets out, he will go to rehab so he can stay sober,” says Don Gibble, the troubled star’s manager."]

In [79]:
email_cont = ["Hi Christian, Laura, We have a rare opportunity to meet together. Laura and I will be attending a wedding in Vancouver. I'd be delighted if, by some fortuitous chance, the three of us could meet for coffee, lunch (or other) to catch up and share ideas! Alas, I realize that our schedules may not align but I thought it would be worth a try. Let me know if you have any availability :) All the best,Stefan"]

In [80]:
blog_post = ["When I feel hopeless and impotent, the quickest cure is to be of service in some way. When I read the news of the disgusting (and wholly unsurprising, for those who’ve been following along at home) racist march in Charlottesville, I wanted to know how I could help. Well, first I got angry. And I said some unkind things. (I honestly feel pretty okay about most of those things.) But then I remembered the other part, the part where I feel better when I do something. Maybe it’s selfish in a sense, but if it helps folks, maybe it’s the good kind of selfish. Anyway, some simple Google searching led me to write a Twitter thread on local nonprofits (you can find it here.) But since not everybody is on Twitter, I figured I’d write a quick blog post that you can share on FB, via email, or however you like. Here are some places that would likely be grateful for your support during this troubling time (and, to be honest, at any time — nonprofits can usually use the help.) Disclaimer: I don’t work for any of these organizations and I’ve never worked with them. But I found some good information online that pointed to them as important pillars of the community (and a few suggestions came through after I wrote my original thread), so I’m sharing them here. Hope you’ll find something that appeals to you. Even a little bit of money helps. NAACP Albemarle-Charlottesville (Branch 7057) merged two NAACP branches in 2001. The Albemarle branch was founded in 1953 and the Charlottesville branch in 1947. Black Student Alliance at UVA is doing the work (consider sharing some kind words on Twitter, too.) Charlottesville Pride is an LGBTQ organization that runs a variety of programs and events in Charlottesville. Planned Parenthood South Atlantic serves various communities, Charlottesville among them. National Organization for Women, Charlottesville chapter seeks to empower women in Charlottesville and beyond. Meals on Wheels of Charlottesville serves nutritious meals to many individuals, particularly homebound seniors who may have no other visitors. African American Teaching Fellows is an organization working to increase diversity among teaching staff in a system where only 10% of educators are African-America. Brody Jewish Center of the University of Virginia is the Hillel branch at UVA (and here’s a good primer on the long history of the Jewish community in Charlottesville, thanks to the Institute of Southern Jewish Life). Congregation Beth Israel is the only synagogue in Albemarle County. Drop them a kind email and tell them you’d like to donate. IMPACT Charlottesville is an interfaith organization working for social justice. The Women’s Initiative provides mental healthcare to women regardless of a patient’s ability to pay. The Arc of the Piedmont helps adults with intellectual and developmental disabilities. The Virginia Centers for Independent Living comprise an organization that helps adults with disabilities to lead independent lives as full members of their communities. Big Brothers Big Sisters of the Central Blue Ridge provides mentorship to young people in the Charlottesville area. Piedmont Housing Alliance helps diverse clientele in the Charlottesville area access affordable housing and attain financial solubility and independence. Legal Aid Justice Center provides legal assistance to low-income individuals and seeks equal justice for all who live in Virginia. Beloved Community Charlottesville specifically seeks to meet hatred with love in a very creative way. The Haven provides respite and care for homeless folks in downtown Charlottesville. Interntional Rescue Committee has an office in Charlottesville, where they work to settle refugees in the surrounding area. (Thank you Lauren Kathryn Berry-Kagan for commenting with the helpful link.) Sexual Assault Resource Agency (SARA) does extraordinary work. Great Expectations is a personal favorite of mine. They work through the whole state of Virginia to help foster youth (a vulnerable and diverse population of young people) transition to work and community college. These kids usually don’t have much or any support from their family of origin, financially or otherwise. They may have bounced around to various homes and endured a lot of trauma in their lives. GE provides mentorship and guidance for the hundreds of youth who age out of foster care and services each year. Now, Virginia is a diverse and beautiful state. (My main problem with Virginia is that it takes so damn long to drive through, but otherwise, I’m a fan.) The people who marched the other night don’t represent all Virginians, or even most Virginians. (And yeah, some of them were from out of state, but I’m sure plenty of ’em were from in-state. That’s not my point here.) There are a lot of good people in Virginia, and many of them benefit from, work for, volunteer with, or donate to the organizations listed above. I leave you with this: VA is the birthplace of one @MissyElliott. That alone is evidence the place can yield greatness. Don’t lose hope just because some people are the worst. Some people are the best, too. The rest of us are somewhere in between, but we keep striving for better. Charitable acts are one way to help us get there."]

In [81]:
print(len(content1))

1


In [82]:
y_pred_proba_sample = content_model.predict_proba(content_Googl)
print(len(y_pred_proba_sample))
y_pred_proba_sample_new = np.split(y_pred_proba_sample, 3, axis =1)
if y_pred_proba_sample_new[0]*100 > 30:
    print(y_pred_proba_sample_new[0]*100, "greater than 30")
sample_news_likeness = str(y_pred_proba_sample_new[0]*100).replace('[', '').replace(']', '')
sample_hyperpartisan_opinion_likeness = str(y_pred_proba_sample_new[1]*100).replace('[', '').replace(']', '')
sample_tabloid_likeness = str(y_pred_proba_sample_new[2]*100).replace('[', '').replace(']', '')

print("News likeness score is:", sample_news_likeness, "\nHyperpartisan trash likeness score is:", sample_hyperpartisan_opinion_likeness, "\nTabloid likeness score is:", sample_tabloid_likeness)

after make_df_class: <class 'pandas.core.frame.DataFrame'> (1, 1)
1
News likeness score is:  3.53751879 
Hyperpartisan trash likeness score is:  96.46248121 
Tabloid likeness score is:   4.27432153e-17


In [83]:
coefs_cbc = ppl1.named_steps['clf'].coef_[0]
coefs_rebel = ppl1.named_steps['clf'].coef_[1]
coefs_star = ppl1.named_steps['clf'].coef_[2]

In [84]:
feature_names = ppl1.named_steps['vectorizer'].get_feature_names()

In [85]:
sorted(list(zip(feature_names, ppl1.named_steps['clf'].coef_)))

[('!', array([ -6.17553915e-01,   4.71026232e-06,   2.35513116e-06, ...,
          -7.30326844e-05,  -6.61877123e-05,  -6.61877123e-05])),
 ('! !', array([ -3.37933044e-01,  -2.87626309e-06,  -1.43813155e-06, ...,
           8.90150870e-05,   1.08190832e-04,   1.08190832e-04])),
 ('! ! !', array([  6.70326944e-01,  -8.73160843e-07,  -4.36580422e-07, ...,
          -7.31069239e-06,  -1.40270734e-04,  -1.40270734e-04]))]

In [86]:
sorted(list(zip(feature_names, coefs_rebel)), key = lambda x: (x[1]), reverse = True)

[('?', 0.45592544667399587),
 ('dt nnp', 0.36777631213773743),
 ('in prp vbp', 0.34494234794011169),
 ('in vbg', 0.33427457982324277),
 ('wdt', 0.32443987497841154),
 ('wp', 0.27824712792792822),
 ('to vb', 0.27345770769398803),
 ('jj nn cc', 0.26826746428157044),
 ('dt', 0.26097015122204165),
 ('prp nn to', 0.2572688461694207),
 ('dt jj nns', 0.24113009819011458),
 ('jj nn', 0.23601192231414866),
 ('vbp prp', 0.22828968042706543),
 ('nn vbg', 0.2267786251395815),
 ('prp vbd', 0.2233125770658537),
 ('prp in', 0.219940670838876),
 ('vbd to vb', 0.20423276493523412),
 ('dt in dt', 0.197491340464441),
 ('fw', 0.1944665752766048),
 ('? prp', 0.19063021235905797),
 ('nnp prp nn', 0.18149170721383481),
 ('in vbg dt', 0.17864162187095434),
 ('nnp nnp vbz', 0.17616471698510155),
 ('vbg jj', 0.1721322513484197),
 ('vbp jj', 0.16982613169022995),
 ('prp vbd to', 0.1695726117139193),
 ('rb dt', 0.16309636544587588),
 ('in nn nnp', 0.16215581945606269),
 ('in', 0.16086450350535941),
 ('cc in', 0.1

In [87]:
sorted(list(zip(feature_names, coefs_cbc)), key = lambda x: (x[1]), reverse = True)

[('pos', 0.68491051310789086),
 ('vbd', 0.34931609408169068),
 ('nnp pos', 0.34012795865313089),
 ('nn pos', 0.3311752251327188),
 ('cd nnp', 0.26809090627606297),
 ('vbn in', 0.25987666568714757),
 ('cd', 0.25878574526585257),
 ('prp md', 0.25785647559422764),
 ('nn nnp', 0.23088340158648274),
 ('vbz vbg', 0.22876089023162535),
 ('nn in nnp', 0.22749365035539482),
 ('in cd', 0.22215841676276227),
 ('nnp nnp nnp', 0.21810791507293176),
 ('rb vb', 0.21754515064807173),
 ('md', 0.21227694923473558),
 ('prp vbp vbg', 0.21210677683368451),
 ('vbd nnp', 0.20845507502537797),
 ('vb dt nn', 0.20368335915142127),
 ('prp vbz', 0.20282194401949083),
 ('nns in dt', 0.19817010015547645),
 ('in dt nn', 0.1964976977825173),
 ('dt nn pos', 0.19461148456594587),
 ('nnp nn in', 0.19235443739063587),
 ('nnp vbd prp', 0.19157979959477425),
 ('in in', 0.18196019495700452),
 ('pos nnp', 0.18135005410424102),
 ('nnp in', 0.18115568700221454),
 ('in prp nn', 0.17716233923529168),
 ('vb dt', 0.175275522755104

In [88]:
sorted(list(zip(feature_names, coefs_star)), key = lambda x: (x[1]), reverse = True)

[('!', 0.6703269442423172),
 ('in prp', 0.32537227738206648),
 ('dt nns', 0.30335136296968912),
 ('vbz dt', 0.28058735343027169),
 ('nn vb', 0.257094725991657),
 ('nnp dt', 0.23368447612126561),
 ('vbz dt nn', 0.23325347317468964),
 ('nns vbp', 0.2328886141650226),
 ('! nn', 0.22843548072205314),
 ('prp vbz', 0.22786881363584216),
 ('nnp !', 0.22670811476418909),
 ('nnp cd', 0.21632916948911624),
 ('nn prp vbp', 0.20221812486270774),
 ('cd', 0.20200001884276894),
 ('cd vbd', 0.19548945850061467),
 ('in nnp rb', 0.18815353530060661),
 ('prp nn cc', 0.18804520792350635),
 ('in prp vbz', 0.17776589129840942),
 ('nnp nnp !', 0.17615530657727238),
 ('nn in prp', 0.17572842829657148),
 ('prp jj', 0.17514408163896816),
 ('dt jj nn', 0.17227382145210027),
 ('cc nnp', 0.17117331769801641),
 ('jj rb', 0.16899135169646864),
 ('nnp nnp nn', 0.16700342776452101),
 ('vb', 0.16318543532511448),
 ('nnp cd vbd', 0.16297047706610746),
 ('nnp vbz dt', 0.16295598154843222),
 ('nn cd', 0.15927032282073889)

### Code below is for classifying headlines

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

In [90]:
X = article_df[['headline','story_content_parsed']].reset_index()
y = article_df[['news_topic', 'source']].reset_index()

X = X.drop('index', 1)
y = y.drop('index', 1)

X = X.fillna(value = 'empty')
y = y.fillna(value= 'empty')

In [91]:
X = X['headline']
y = y['source']


In [92]:
headline_stop_words = ['Trudeau', 'Trump', 'alberta', 'live','bc', 'listen', 'trump', 'trudeau', 'mcinnes', 'name1m', 'daily', 'roundup', 'streams', 'stream', 'eclipse', 'toronto', 'edge', 'celebrity', 'montreal', 'solar', 'radio', 'introducing', 'notley', 'tommy', 'robinson', 'ann', 'coulter', 'thicke', 'kardashian', 'kim']

In [93]:
class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self
    print("Here at TextStats")    
    def transform(self, docs):
        return [{'length': len(text),
                 'num_sentences': text.count('.'),
                'sentiment_score_pos': TextBlob(text).sentiment[0],
                'subjectivity_score': TextBlob(text).sentiment[1]}
                for text in docs]

Here at TextStats


In [94]:
text_stats_pipeline = Pipeline([
                    ('stats', TextStats()),
                    ('vect', DictVectorizer())
                ])


In [95]:
clean_headline_pipeline = Pipeline([
    ('makestr', make_string_class()),
    ('make_df', make_df_class()),
    ('pos_tag_flat', pos_tag_and_flat_class()),
    ('name1', replace_names_class())
])

In [96]:
ppl2 = Pipeline([
        ('clean_headline', clean_headline_pipeline),
        ('union', FeatureUnion(
            transformer_list=[
                ('textstats', text_stats_pipeline),

                ('headline_analysis', Pipeline([
                    ('vectorizer', CountVectorizer(ngram_range=(1, 3), analyzer='word', stop_words=custom_stop_words + custom_punct + headline_stop_words, tokenizer=word_tokenize)),
                ]))
           ],
            transformer_weights= {
            'textstats': 1.0,
            'headline_analysis': 1.0,
        },
        )),
              ('to_dense', DenseTransformer()),
              ('clf',   LogisticRegression())
      ])
         
    
# train the classifier AFTER applying tagging
headline_model = ppl2.fit(X, y)


after make_df_class: <class 'pandas.core.frame.DataFrame'> (690, 1)


In [97]:
joblib.dump(headline_model, 'news_headline_classification_model.pkl') 

['news_headline_classification_model.pkl']

In [98]:
test1 = "INTO THE SIN BIN! ‘Mighty Ducks’ Kid Scores … Jail Time Child star grew up to be chronic shoplifter!"
test2 = ["Colin Kaepernick to Get His Own Display at the Smithsonian’s National Museum of African American History and Culture"]
test3 = ["Steve Bannon fired as Trump White House's top strategist"]
test4 = ["Alaska Warily Eyes Change Bringing Suburbs and Amazon Boxes"]
test5 = ["Trump expected to announce small troop increase for Afghanistan in prime time address"]

In [99]:
testid = []
testid.append(''.join(test1))
testid

['INTO THE SIN BIN! ‘Mighty Ducks’ Kid Scores … Jail Time Child star grew up to be chronic shoplifter!']

In [100]:
headline = testid

y_pred_proba_sample = headline_model.predict_proba(headline)
y_pred_proba_sample_new = np.split(y_pred_proba_sample, 3, axis =1)
sample_news_likeness = str(y_pred_proba_sample_new[0]*100).replace('[', '').replace(']', '')
sample_hyperpartisan_opinion_likeness = str(y_pred_proba_sample_new[1]*100).replace('[', '').replace(']', '')
sample_tabloid_likeness = str(y_pred_proba_sample_new[2]*100).replace('[', '').replace(']', '')

print("News likeness score is:", sample_news_likeness, "\nHyperpartisan trash likeness score is:", sample_hyperpartisan_opinion_likeness, "\nTabloid likeness score is:", sample_tabloid_likeness)

after make_df_class: <class 'pandas.core.frame.DataFrame'> (1, 1)
News likeness score is:  0.03554749 
Hyperpartisan trash likeness score is:  8.25624149 
Tabloid likeness score is:  91.70821101


In [101]:
TextBlob(str(headline)).sentiment

Sentiment(polarity=0.1375, subjectivity=0.45)

In [102]:
coefs_cbc = ppl2.named_steps['clf'].coef_[0]
coefs_rebel = ppl2.named_steps['clf'].coef_[1]
coefs_star = ppl2.named_steps['clf'].coef_[2]

In [103]:
u = ppl2.named_steps['union']

In [104]:
u.transformer_list[1][1]

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None,
        stop_words=['war...n='(?u)\\b\\w\\w+\\b',
        tokenizer=<function word_tokenize at 0x110796f28>, vocabulary=None))])

In [105]:
p = u.transformer_list[1][1]

In [106]:
v = p.named_steps["vectorizer"]

In [107]:
s = u.transformer_list[0][1]

In [108]:
s2 = s.named_steps['vect']

In [109]:
text_influence = s2.get_feature_names()

In [110]:
sorted(list(zip(text_influence, coefs_rebel)), key = lambda x: (x[1]), reverse = True)

[('subjectivity_score', 0.21224956923136432),
 ('sentiment_score_pos', 0.067365750481775233),
 ('length', -0.018165332618418562),
 ('num_sentences', -0.63673474197934365)]

In [111]:
feature_names = v.get_feature_names()

In [112]:
sorted(list(zip(feature_names, coefs_rebel)), key = lambda x: (x[1]), reverse = True)

[('nn ! hall', 0.53270287066718236),
 ('nnp nnp was', 0.53247609165532073),
 ('nnp vbd baby', 0.53074908246744135),
 ('in nnp barcelona', 0.49432212786692126),
 ('is vbz bad', 0.48820313370180812),
 ('ndp nnp keeps', 0.44922180549964169),
 ('ndp nnp leader', 0.44922180549964169),
 ('nn tricks', 0.44667309458553572),
 ('nn tricks nns', 0.44667309458553572),
 ('is nnp scary', 0.41138562347619062),
 ('conservative jj parties', 0.39787071838272542),
 ('nn nnp bribe', 0.3975441264677938),
 ('? are nnp', 0.3906850504669882),
 ('nnp ? emily', 0.37030598149429972),
 ("nns 'are vbp", 0.36722236450197054),
 ('nnp nelson', 0.34314120102704176),
 ('nnp nelson nnp', 0.34314120102704176),
 ('jj ! plus', 0.31698335896160368),
 ('antifa nnp violence', 0.30526713073516504),
 ('anyone', 0.30526713073516504),
 ('nnp include', 0.30351730915366237),
 ('why wrb did', 0.28656693109951326),
 ('why wrb eco-radicals', 0.28656693109951326),
 ('jjr apart rb', 0.27307086502532346),
 ('we prp don', 0.26748427365158

In [113]:
sorted(list(zip(text_influence, coefs_cbc)), key = lambda x: (x[1]), reverse = True)

[('length', 0.01083902147286841),
 ('sentiment_score_pos', -0.0056502962502576227),
 ('subjectivity_score', -0.019093188453044078),
 ('num_sentences', -0.51458572730772656)]

In [114]:
sorted(list(zip(feature_names, coefs_cbc)), key = lambda x: (x[1]), reverse = True)

[('nnp nnp active', 0.8862278290899851),
 ('nnp nnp on', 0.81212548217721836),
 ('pos 10 cd', 0.70377832743007296),
 ('more rbr plastic', 0.56707044933865736),
 ('more rbr top', 0.56707044933865736),
 ('nnp mount', 0.56707044933865736),
 ("in 'agt nns", 0.56453876934297531),
 ('opinion nn attacks', 0.54095834367946127),
 ('nnp motion nn', 0.52822088447898397),
 ('analysis nnp an', 0.46471336134795238),
 ('charlottesville nnp prp', 0.43364555288089479),
 ('charlottesville nnp remembered', 0.43364555288089479),
 ('opinion nnp google', 0.39090356643531132),
 ('analysis nnp nnp', 0.38836086580135804),
 ('more jjr reason', 0.36379374635496131),
 ('u.s. nnp marshals', 0.2890025487220147),
 ('u.s. nnp signal', 0.2890025487220147),
 ('md be vb', 0.28887218193287778),
 ('to 478,696 cd', 0.28865199826524679),
 ('nn pos canadian', 0.27326269928549707),
 ('into in nnp', 0.25884139963706426),
 ('in chris', 0.25085587861972497),
 ('in chris nnp', 0.25085587861972497),
 ('into in night', 0.2490522757

In [115]:
sorted(list(zip(text_influence, coefs_star)), key = lambda x: (x[1]), reverse = True)

[('num_sentences', 1.3231649684928581),
 ('length', -0.01878817860985605),
 ('sentiment_score_pos', -0.17505996257636375),
 ('subjectivity_score', -0.30986623674425601)]

In [116]:
sorted(list(zip(feature_names, coefs_star)), key = lambda x: (x[1]), reverse = True)

[("! 'bachelorette", 1.3231649684928581),
 ("nnp ! 'she", 0.62502510870268935),
 ("prp 'fantastic", 0.55964474236519401),
 ("! 'power jjr", 0.42289666059357067),
 ("dt 'chilling vbg", 0.36764209678105469),
 ('she prp can', 0.35206385215977731),
 ('she prp did', 0.35206385215977731),
 ("vbg 'briefcase law", 0.31843410382318005),
 ('her nnp businessman', 0.31795638333869453),
 ('jj nnp amber', 0.3165467673331871),
 ("the dt 'leftovers", 0.24811877531915666),
 ("the dt 'quantico", 0.24811877531915666),
 ('says vbz an', 0.24338523594299877),
 ('says vbz dj', 0.24338523594299877),
 ('news nnp posted', 0.24075138322927506),
 ('nnp nn cut', 0.20513047409328863),
 ('cc a-rod nnp', 0.20068603484819925),
 ("vbd 'architect", 0.19585037846662595),
 ('nnp cc bella', 0.19145132234591059),
 ('he prp could', 0.1867061320635833),
 ('he prp even', 0.1867061320635833),
 ('jj nn prp', 0.18515772331206842),
 ('news nnp shia', 0.18515772331206842),
 ('nn nightmare', 0.18515772331206842),
 ('nn nightmare nn'