In [1]:
import pandas as pd
import numpy as np
import gensim
import re
import os
import logging
from h2o.estimators.word2vec import H2OWord2vecEstimator
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.estimators import H2ORandomForestEstimator
import h2o
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus that the stop-word removal step reads later.
nltk.download('stopwords')
# Emit log records at INFO level and above, prefixed with a timestamp and the level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/toprak.ucar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Collect every CSV export in the data directory. The listing is sorted because
# os.listdir returns entries in arbitrary order, which would otherwise make the
# row order of the combined frame differ between machines.
csv_files = sorted(pos_csv for pos_csv in os.listdir("data/") if pos_csv.endswith('.csv'))
print(csv_files)

# Read each file once and concatenate in a single call. DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
article_frames = [pd.read_csv(os.path.join("data/", file)) for file in csv_files]
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(article_frames) if article_frames else pd.DataFrame()

['articles1.csv', 'articles3.csv', 'articles2.csv']


In [3]:
# Inspect the column names of the combined frame.
df.columns

Index(['Unnamed: 0', 'id', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content'],
      dtype='object')

In [4]:
# Discard the metadata columns that are not used by the rest of the notebook.
unused_columns = ['author', 'date', 'year', 'month', 'url']
df = df.drop(columns=unused_columns)

In [5]:
# Normalise article titles to lower case.
df = df.assign(title=df['title'].str.lower())

In [6]:
# Normalise publication names to lower case so later comparisons
# (e.g. against 'breitbart') are case-insensitive.
df = df.assign(publication=df['publication'].str.lower())

In [7]:
# Normalise article bodies to lower case before tokenisation.
df = df.assign(content=df['content'].str.lower())

In [8]:
# Replace every non-letter character with a space. regex=True is passed
# explicitly: newer pandas treats the pattern as a literal string by default,
# which would leave punctuation and digits in the text.
df['content'] = df['content'].str.replace('[^a-zA-Z]', ' ', regex=True)

In [9]:
# English stop-word list from NLTK (relies on the 'stopwords' corpus being downloaded).
stop_words = stopwords.words('english')

In [10]:
# Strip English stop words from every article body. A set gives constant-time
# membership checks; looking each token up in the list scanned the whole list.
stop_word_set = set(stop_words)


def _remove_stop_words(text):
    """Return ``text`` without stop words; non-string values (e.g. NaN) pass through unchanged."""
    if not isinstance(text, str):
        return text
    return ' '.join(word for word in text.split() if word not in stop_word_set)


df['content'] = df['content'].apply(_remove_stop_words)

In [11]:
# Number of non-null values in each column, per publication.
df.groupby(['publication']).count()

Unnamed: 0_level_0,Unnamed: 0,id,title,content
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
atlantic,7179,7179,7179,7179
breitbart,23781,23781,23781,23781
business insider,6757,6757,6757,6757
buzzfeed news,4854,4854,4854,4854
cnn,11488,11488,11488,11488
fox news,4354,4354,4354,4354
guardian,8681,8681,8681,8681
national review,6203,6203,6203,6203
new york post,17493,17493,17493,17493
new york times,7803,7803,7803,7803


In [12]:
# Total number of rows (articles) loaded.
len(df)

142570

In [13]:
# Binary target stored as string labels: "1" for Breitbart articles, "0" otherwise.
is_breitbart_mask = df['publication'].eq('breitbart')
df["isBreitbart"] = np.where(is_breitbart_mask, "1", "0")

In [14]:
# Tokenise every article into a list of words for Word2Vec. Iterating the column
# directly avoids building a full row Series twice per article through df.iloc.
# Non-string entries (e.g. NaN from a missing article body) are skipped, matching
# the string check used when the feature table is built.
words = [text.split() for text in df['content'] if isinstance(text, str)]

In [15]:
# Train the word embedding on the tokenised articles.
# - size=50 matches the 50 `v*` feature columns built later in the notebook.
# - min_count=1 keeps every token in the vocabulary.
# NOTE: `size` and `iter` are the gensim 3.x parameter names; gensim 4 renamed
# them to `vector_size` and `epochs`.
word2vec_options = {
    'window': 15,
    'size': 50,
    'iter': 10,
    'min_count': 1,
    'workers': 4,
}
model = gensim.models.Word2Vec(words, **word2vec_options)



In [16]:
# Sanity check of the embedding: words most similar to "istanbul".
model.wv.similar_by_word("istanbul")

  if np.issubdtype(vec.dtype, np.int):


[('ataturk', 0.7852783203125),
 ('bosporus', 0.7758889198303223),
 ('ankara', 0.7601418495178223),
 ('ortakoy', 0.7480058073997498),
 ('kocarslan', 0.7401304841041565),
 ('hurriyet', 0.7389910221099854),
 ('gaziantep', 0.7354176044464111),
 ('izmir', 0.7345712184906006),
 ('leyman', 0.7324717044830322),
 ('turkish', 0.7288885712623596)]

In [17]:
# Sanity check of the embedding: words most similar to "messi".
model.wv.similar_by_word("messi")

  if np.issubdtype(vec.dtype, np.int):


[('ronaldo', 0.9083605408668518),
 ('neymar', 0.8756331205368042),
 ('barcelona', 0.84525066614151),
 ('higuain', 0.8399900197982788),
 ('striker', 0.839557409286499),
 ('atletico', 0.836921215057373),
 ('cristiano', 0.8350076675415039),
 ('goalkeeper', 0.8214976191520691),
 ('juve', 0.8200994729995728),
 ('isco', 0.8191418051719666)]

In [18]:
# Sanity check of the embedding: words most similar to "ataturk".
model.wv.similar_by_word("ataturk")

  if np.issubdtype(vec.dtype, np.int):


[('atat', 0.8466954827308655),
 ('kemal', 0.8030822277069092),
 ('istanbul', 0.7852783203125),
 ('turkishminutetm', 0.7809292674064636),
 ('hurriyet', 0.7457811832427979),
 ('turkish', 0.7173997759819031),
 ('erdo', 0.7094042301177979),
 ('yeni', 0.7084641456604004),
 ('erdogan', 0.7062608599662781),
 ('kizilay', 0.702795147895813)]

In [19]:
# Analogy query: words closest to (paris + turkey) with france counted against.
model.wv.most_similar(positive=['paris', 'turkey'], negative=['france'])

  if np.issubdtype(vec.dtype, np.int):


[('ankara', 0.6917442679405212),
 ('turkish', 0.6719129085540771),
 ('syria', 0.6188807487487793),
 ('syrian', 0.6121764779090881),
 ('tehran', 0.6115056276321411),
 ('turks', 0.6065649390220642),
 ('erdogan', 0.6037282943725586),
 ('damascus', 0.6023814082145691),
 ('tayyip', 0.5860105752944946),
 ('akinci', 0.585946261882782)]

In [20]:
# Analogy query: words closest to (trump + turkey) with america counted against.
model.wv.most_similar(positive=['trump', 'turkey'], negative=['america'])

  if np.issubdtype(vec.dtype, np.int):


[('turkish', 0.6825993657112122),
 ('erdogan', 0.6465330719947815),
 ('davutoglu', 0.6372458934783936),
 ('yildirim', 0.6269304752349854),
 ('marashipov', 0.622066855430603),
 ('ankara', 0.6176018714904785),
 ('erdo', 0.6105588674545288),
 ('davuto', 0.5798016786575317),
 ('kremlin', 0.5581178665161133),
 ('reproaches', 0.5468676090240479)]

In [21]:
# Analogy query: words closest to (messi + madrid) with barcelona counted against.
model.wv.most_similar(positive=['messi', 'madrid'], negative=['barcelona'])

  if np.issubdtype(vec.dtype, np.int):


[('ronaldo', 0.8427572250366211),
 ('cristiano', 0.8336759805679321),
 ('atletico', 0.8255482316017151),
 ('neymar', 0.8068755269050598),
 ('higuain', 0.795189380645752),
 ('striker', 0.7914223074913025),
 ('zidane', 0.7898510694503784),
 ('atl', 0.7897554039955139),
 ('isco', 0.7802573442459106),
 ('benzema', 0.7757295966148376)]

In [22]:
# Analogy query: words closest to (louvre + italy) with france counted against.
model.wv.most_similar(positive=['louvre', 'italy'], negative=['france'])

  if np.issubdtype(vec.dtype, np.int):


[('seine', 0.7563114166259766),
 ('rueger', 0.697585940361023),
 ('vermeer', 0.6901447772979736),
 ('ingres', 0.6818567514419556),
 ('frescoes', 0.6806928515434265),
 ('porto', 0.671973705291748),
 ('hatton', 0.6668001413345337),
 ('parthenon', 0.6627079248428345),
 ('milan', 0.6609148979187012),
 ('prizren', 0.6590593457221985)]

In [23]:
# Persist the trained Word2Vec model. The target directory is created first so
# the save does not fail on a fresh checkout where "model/" does not exist yet.
os.makedirs("model", exist_ok=True)
model.save(fname_or_handle="model/" + "model_for_news")

In [24]:
def calculate_avg_vecs(column):
    """Average the Word2Vec vectors of every word in a row's ``content``.

    Parameters
    ----------
    column : object
        Despite the name this is a single row (e.g. an ``itertuples`` record);
        only its ``content`` attribute, a whitespace-separated string, is read.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the word vectors. A zero vector with the model's
        dimensionality is returned when no word of the text has a vector.
    """
    # Look vectors up through ``model.wv``: indexing the model object directly
    # is deprecated in gensim.
    keyed_vectors = model.wv
    # Words unknown to the model are skipped instead of raising KeyError.
    vecs = [keyed_vectors[word]
            for word in getattr(column, 'content').split()
            if word in keyed_vectors]
    if not vecs:
        # Size the fallback from the model rather than hard-coding 50.
        return np.zeros(keyed_vectors.vector_size)
    return np.divide(np.sum(vecs, axis=0), len(vecs))

In [25]:
def calculate_text_length_by_words(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    tokens = text.split()
    return len(tokens)

In [26]:
def calculate_text_length_by_chars(text):
    """Return the number of characters in ``text``, whitespace included."""
    char_count = len(text)
    return char_count

In [27]:
def generate_publisher_dictionary(df):
    """Map each distinct value of ``df['publication']`` to an integer code.

    Codes run 0, 1, 2, ... in the order ``unique()`` returns the publications.
    """
    publishers = df['publication'].unique()
    codes = np.arange(len(publishers))
    return {publisher: code for publisher, code in zip(publishers, codes)}

In [28]:
# Column names v0 .. v49, one per dimension of the averaged word vector.
vec_col_names = ["v{}".format(dim) for dim in range(50)]

In [29]:
def generate_columns_for_model(df):
    """Build the per-article feature table used for model training.

    Rows whose ``content`` is not a string are skipped. Each remaining row
    contributes one record with these columns, in order: ``content``,
    ``number_of_words``, ``len_of_text``, ``publication`` (the integer code
    from the module-level ``publisher_dict``) and one column per name in
    ``vec_col_names`` holding the averaged word vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``content`` and ``publication`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per kept article, with a fresh 0..n-1 index.
    """
    # Collect plain dict records and build the frame once at the end. Creating
    # a DataFrame per row and appending it re-copied the whole result on every
    # iteration, and DataFrame.append no longer exists in pandas 2.0.
    records = []
    for row in df.itertuples():
        content = getattr(row, 'content')
        if not isinstance(content, str):
            continue
        record = {
            'content': content,
            'number_of_words': calculate_text_length_by_words(content),
            'len_of_text': calculate_text_length_by_chars(content),
            'publication': publisher_dict.get(getattr(row, 'publication')),
        }
        record.update(zip(vec_col_names, calculate_avg_vecs(row).tolist()))
        records.append(record)

    # Passing the column list keeps the layout stable even when no row qualifies.
    ordered_columns = ['content', 'number_of_words', 'len_of_text', 'publication'] + list(vec_col_names)
    return pd.DataFrame(records, columns=ordered_columns)

In [None]:
# Map each publication name to an integer code. generate_columns_for_model reads
# this module-level dict, so it has to be assigned before the call below.
publisher_dict = generate_publisher_dictionary(df)
# Build the per-article feature table (text, length features, publication code, averaged word vectors).
feature_columns = generate_columns_for_model(df)

  """


In [None]:
# Label each row "1" if it is a Breitbart article and "0" otherwise. The numeric
# code is looked up in publisher_dict instead of being hard-coded: codes follow
# the order in which publications are first encountered in the data, so
# Breitbart is not guaranteed to be code 1. A missing key raises KeyError rather
# than silently labelling every row "0".
breitbart_code = publisher_dict['breitbart']
feature_columns["isBreitbart"] = np.where(feature_columns['publication'] == breitbart_code, "1", "0")

In [None]:
# Stratified split: draw 70% of each class for training. The seed makes the
# split reproducible across runs. Groups are sampled by iterating the groupby
# rather than through groupby.apply, whose handling of the grouping column is
# deprecated in recent pandas.
SPLIT_SEED = 42
training = pd.concat([
    class_rows.sample(frac=0.7, random_state=SPLIT_SEED)
    for _, class_rows in feature_columns.groupby('isBreitbart')
])
# Everything whose text occurs exactly once across (training + all rows) is held
# out, i.e. rows that were not sampled into training and whose content is unique.
test = pd.concat([training, feature_columns]).drop_duplicates(subset='content', keep=False)

In [None]:
# Connect to a local H2O server, starting one if none is running; the JVM is capped at 10 GB.
h2o.init(max_mem_size="10G")
# NOTE(review): h2o.init() already leaves the session connected, so this explicit
# connect() looks redundant — confirm before removing.
h2o.connect()

In [None]:
# Upload the pandas splits to the H2O cluster.
training_frame = h2o.H2OFrame(training)
test_frame = h2o.H2OFrame(test)
# A second copy of the held-out data is uploaded and split in half into a test
# part and a validation part. The seed makes this split reproducible.
hf = h2o.H2OFrame(test)
test_hf, validation_hf = hf.split_frame(ratios=[0.5], seed=42)

In [None]:
# H2O only treats the response as a classification target when the column is
# categorical. Convert it on every frame that reaches a model: test_hf and
# validation_hf are the frames actually used for scoring and validation, so they
# need the same conversion as the training frame.
for h2o_frame in (training_frame, test_frame, test_hf, validation_hf):
    h2o_frame['isBreitbart'] = h2o_frame['isBreitbart'].asfactor()

In [None]:
# Predictors are the word-vector columns plus the two length features. A new
# list is built instead of extending vec_col_names in place: mutating it made
# this cell append the names again on every re-run and left vec_col_names with
# more names than vector dimensions for generate_columns_for_model.
predictors = vec_col_names + ['number_of_words', 'len_of_text']
response = 'isBreitbart'

In [None]:
# Random forest configuration: 20 trees of depth at most 10, with 10-fold cross-validation.
rf_model = H2ORandomForestEstimator(
    model_id="rf_model",
    nfolds=10,
    ntrees=20,
    max_depth=10,
)

In [None]:
# Fit the random forest on the training frame, tracking metrics on validation_hf.
rf_model.train(
    training_frame=training_frame,
    validation_frame=validation_hf,
    x=predictors,
    y=response,
)

In [None]:
# Persist the trained random forest under model/model_name; force=True asks H2O
# to overwrite an existing model at that location.
h2o.save_model(model=rf_model,
                      path='model/model_name',
                      force=True)

In [None]:
# Score the random forest on test_hf, the part of the held-out data not used for validation.
performance = rf_model.model_performance(test_data=test_hf)

In [None]:
# Display the metrics object currently bound to `performance`.
performance

In [None]:
# Export the engineered feature table to CSV in the working directory
# (the DataFrame index is written as the first column).
feature_columns.to_csv("feature_columns.csv")

In [None]:
# Gradient boosting configuration: early stopping on AUC (tolerance 0.001 over
# 5 scoring rounds), scoring every 10 trees, with a fixed seed.
gbm_options = {
    'model_id': "id",
    'seed': 2000000,
    'score_tree_interval': 10,
    'stopping_metric': 'AUC',
    'stopping_rounds': 5,
    'stopping_tolerance': 0.001,
}
gradient_boosting_estimator = H2OGradientBoostingEstimator(**gbm_options)

In [None]:
# Fit the gradient boosting model with the same predictors, response and frames
# as the random forest.
gradient_boosting_estimator.train(
    x=predictors,
    y=response,
    training_frame=training_frame,
    validation_frame=validation_hf,
)
 

In [None]:
# Confusion matrix computed on the validation frame supplied at training time.
gradient_boosting_estimator.confusion_matrix(valid=True)

In [None]:
# Score the gradient boosting model on test_hf. This rebinds `performance`,
# replacing the random-forest metrics stored under the same name.
performance = gradient_boosting_estimator.model_performance(test_data=test_hf)
print(performance)