In [1]:
import pandas as pd
import numpy as np
import gensim
import datetime
import re
import os
import logging
import time
from h2o.estimators.word2vec import H2OWord2vecEstimator
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.estimators import H2ORandomForestEstimator
import h2o
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/toprak.ucar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
csv_files = [pos_csv for pos_csv in os.listdir("data/") if pos_csv.endswith('.csv')]
print(csv_files)
df = pd.DataFrame()

for file in csv_files:
    df = df.append(pd.read_csv("data/" + file))

['articles1.csv', 'articles3.csv', 'articles2.csv']


In [3]:
df.columns

Index(['Unnamed: 0', 'id', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content'],
      dtype='object')

In [4]:
df = df.drop(['author', 'date', 'year', 'month', 'url'], axis=1)

In [5]:
df['title'] = df['title'].str.lower()

In [6]:
df['publication'] = df['publication'].str.lower()

In [7]:
df['content'] = df['content'].str.lower()

In [8]:
df['content'] = df['content'].str.replace('[^a-zA-Zğüşçö]', ' ')

In [9]:
stop_words = stopwords.words('english')

In [10]:
df['content'] = df['content'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [11]:
df.groupby(['publication']).count()

Unnamed: 0_level_0,Unnamed: 0,id,title,content
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
atlantic,7179,7179,7179,7179
breitbart,23781,23781,23781,23781
business insider,6757,6757,6757,6757
buzzfeed news,4854,4854,4854,4854
cnn,11488,11488,11488,11488
fox news,4354,4354,4354,4354
guardian,8681,8681,8681,8681
national review,6203,6203,6203,6203
new york post,17493,17493,17493,17493
new york times,7803,7803,7803,7803


In [12]:
len(df)

142570

In [13]:
df["isBreitbart"] = np.where(df['publication'] == 'breitbart', "1", "0")

In [14]:
words = []
for count in range (len(df)):
    if (type(df.iloc[count]['content']) != float):
        words.append(df.iloc[count]['content'].split())

In [15]:
model = gensim.models.Word2Vec(words, window=15, 
                                       size= 50, iter=10, 
                                       min_count=1, workers = 4)



In [16]:
model.wv.similar_by_word("istanbul")

  if np.issubdtype(vec.dtype, np.int):


[('ataturk', 0.7798241376876831),
 ('bosporus', 0.7557799220085144),
 ('reina', 0.7474915981292725),
 ('ankara', 0.7474610805511475),
 ('ahmet', 0.7405011653900146),
 ('gaziantep', 0.7166070938110352),
 ('ortakoy', 0.7148107290267944),
 ('hurriyet', 0.7147783637046814),
 ('sultanahmet', 0.7122244834899902),
 ('atatürk', 0.7095097303390503)]

In [17]:
model.wv.similar_by_word("messi")

  if np.issubdtype(vec.dtype, np.int):


[('ronaldo', 0.8949919939041138),
 ('neymar', 0.8825325965881348),
 ('barça', 0.8726922273635864),
 ('higuain', 0.8568639755249023),
 ('iniesta', 0.8435724377632141),
 ('sevilla', 0.8430395126342773),
 ('barcelona', 0.8402814865112305),
 ('griezmann', 0.833495557308197),
 ('juve', 0.8228360414505005),
 ('atletico', 0.8223202228546143)]

In [18]:
model.wv.similar_by_word("ataturk")

  if np.issubdtype(vec.dtype, np.int):


[('atatürk', 0.8904459476470947),
 ('kemal', 0.8044395446777344),
 ('istanbul', 0.7798240184783936),
 ('yildiz', 0.736945629119873),
 ('havalimani', 0.7224811315536499),
 ('hurriyet', 0.7168923616409302),
 ('ahmet', 0.7138577699661255),
 ('turkishminutetm', 0.7051156759262085),
 ('erdemir', 0.693200945854187),
 ('reza', 0.6862856149673462)]

In [19]:
model.wv.most_similar(positive=['paris', 'turkey'], negative=['france'])

  if np.issubdtype(vec.dtype, np.int):


[('ankara', 0.7369124293327332),
 ('turkish', 0.6831108927726746),
 ('davutoglu', 0.6586964130401611),
 ('turks', 0.6429847478866577),
 ('doha', 0.6328001022338867),
 ('erdogan', 0.6145455241203308),
 ('syria', 0.6067423224449158),
 ('qataris', 0.6024590730667114),
 ('syrian', 0.5936243534088135),
 ('uae', 0.5925226807594299)]

In [20]:
model.wv.most_similar(positive=['trump', 'turkey'], negative=['america'])

  if np.issubdtype(vec.dtype, np.int):


[('erdogan', 0.6578723192214966),
 ('turkish', 0.6578384041786194),
 ('davutoglu', 0.6330110430717468),
 ('yildirim', 0.6259006261825562),
 ('erdoğan', 0.6233898997306824),
 ('flynn', 0.605996310710907),
 ('duaa', 0.5999085307121277),
 ('ankara', 0.5847311019897461),
 ('plotters', 0.5842882394790649),
 ('lewandowski', 0.5586950778961182)]

In [21]:
model.wv.most_similar(positive=['messi', 'madrid'], negative=['barcelona'])

  if np.issubdtype(vec.dtype, np.int):


[('ronaldo', 0.8423516154289246),
 ('cristiano', 0.8305746912956238),
 ('higuain', 0.829215407371521),
 ('atletico', 0.8175734281539917),
 ('neymar', 0.8093371391296387),
 ('barça', 0.8056430816650391),
 ('iniesta', 0.8003312349319458),
 ('benzema', 0.79571533203125),
 ('atl', 0.7924468517303467),
 ('juve', 0.7840198278427124)]

In [22]:
model.wv.most_similar(positive=['louvre', 'italy'], negative=['france'])

  if np.issubdtype(vec.dtype, np.int):


[('seine', 0.7342692017555237),
 ('uffizi', 0.7259130477905273),
 ('porto', 0.6985342502593994),
 ('vermeer', 0.6922858953475952),
 ('artworks', 0.6869853138923645),
 ('ingres', 0.6850420236587524),
 ('paintings', 0.680021345615387),
 ('rueger', 0.6736742854118347),
 ('agadir', 0.6708642244338989),
 ('restorers', 0.6657865047454834)]

In [23]:
model.wv.most_similar(positive=['paris', 'spain'], negative=['france'])

  if np.issubdtype(vec.dtype, np.int):


[('montreal', 0.7173881530761719),
 ('marrakesh', 0.7167702913284302),
 ('punta', 0.6869816780090332),
 ('basel', 0.686701774597168),
 ('milan', 0.6821087598800659),
 ('lausanne', 0.6813076734542847),
 ('walloons', 0.663448691368103),
 ('switzerland', 0.6631225347518921),
 ('madrid', 0.6612542867660522),
 ('prague', 0.659068763256073)]

In [24]:
model.save(fname_or_handle="model/" + "model_for_news")

In [25]:
def calculate_avg_vecs(column):
    words = column.split()
    vecs = []
    for word in words:
        vecs.append(model[word])
    if len(vecs) == 0:
        return np.zeros(50)
    vec_sum = np.sum(vecs, axis = 0)
    vec_avg = np.divide(vec_sum, len(vecs))
    return vec_avg

In [26]:
def calculate_text_length_by_words(text):
    return len(text.split())

In [27]:
def calculate_text_length_by_chars(text):
    return len(text)

In [28]:
def generate_publisher_dictionary(df):
    publication_array = df['publication'].unique()
    publication_index = np.arange(len(publication_array))
    return dict(zip(publication_array, publication_index))

In [29]:
vec_col_names = []
for x in range(50):
    vec_col_names.append('v' + str(x))

In [30]:
def generate_columns_for_model(df):
    feature_columns = pd.DataFrame()
    feature_columns['len_of_text'] = df.apply(lambda x: len(x['content']), axis=1)
    feature_columns['number_of_words'] = df.apply(lambda x: len(x['content'].split()), axis=1)
    feature_columns['content'] = df['content']
    feature_columns['publication'] = df['publication']
    feature_columns = feature_columns.reset_index()
    vecs = df['content'].apply(lambda x: calculate_avg_vecs(x))
    vecs_df = pd.DataFrame(vecs.values)[0].apply(pd.Series)
    vecs_df = vecs_df.reset_index()
    return pd.concat([feature_columns.set_index('index'),vecs_df.set_index('index')], axis=1, join='inner')

In [31]:
feature_columns = generate_columns_for_model(df)

  """


In [32]:
for i in range (50):
    feature_columns.rename(columns={i: vec_col_names[i]}, inplace=True)

In [44]:
start = datetime.datetime.now()
feature_columns["isBreitbart"] = np.where(feature_columns['publication'] == "breitbart", "1", "0")
end = datetime.datetime.now()
print(end-start)

0:00:00.013233


In [46]:
training = feature_columns.groupby('isBreitbart').apply(lambda x : x.sample(frac = 0.7))
test = pd.concat([training,feature_columns]).drop_duplicates(subset='content',keep=False)       

In [35]:
h2o.init(max_mem_size="10G")
h2o.connect()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,2 days 12 hours 10 mins
H2O cluster timezone:,Europe/Istanbul
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.0.1
H2O cluster version age:,2 months and 7 days
H2O cluster name:,H2O_from_python_toprak_ucar_dfp8qy
H2O cluster total nodes:,1
H2O cluster free memory:,4.695 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


Connecting to H2O server at http://localhost:54321... successful.


0,1
H2O cluster uptime:,2 days 12 hours 10 mins
H2O cluster timezone:,Europe/Istanbul
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.0.1
H2O cluster version age:,2 months and 7 days
H2O cluster name:,H2O_from_python_toprak_ucar_dfp8qy
H2O cluster total nodes:,1
H2O cluster free memory:,4.695 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


<H2OConnection to http://localhost:54321, no session>

In [47]:
training_frame = h2o.H2OFrame(training)
test_frame =  h2o.H2OFrame(test)
hf = h2o.H2OFrame(test)
hf['isBreitbart'] = hf['isBreitbart'].asfactor()
test_hf, validation_hf = hf.split_frame(ratios=[0.5])

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [48]:
training_frame['isBreitbart'] = training_frame['isBreitbart'].asfactor()
test_frame['isBreitbart'] = test_frame['isBreitbart'].asfactor()


In [49]:
vec_col_names.extend(['number_of_words', 'len_of_text'])
predictors = vec_col_names
response = 'isBreitbart'

In [50]:
rf_model = H2ORandomForestEstimator(model_id="rf_model",
                                            ntrees=20, 
                                            max_depth=10, 
                                            nfolds=10)

In [51]:
rf_model.train(x=predictors, 
                       y=response, 
                       training_frame=training_frame, 
                       validation_frame=validation_hf)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [None]:
h2o.save_model(model=rf_model,
                      path='model/model_name',
                      force=True)

In [52]:
performance = rf_model.model_performance(test_data=test_hf)

In [53]:
performance


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.11906473605556668
RMSE: 0.34505758368070494
LogLoss: 0.3814699746568444
Mean Per-Class Error: 0.292509319180112
AUC: 0.7769438345198754
pr_auc: 0.39430786040005944
Gini: 0.5538876690397507
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.23394455186584415: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,14645.0,3176.0,0.1782,(3176.0/17821.0)
1,1621.0,1900.0,0.4604,(1621.0/3521.0)
Total,16266.0,5076.0,0.2248,(4797.0/21342.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2339446,0.4420147,180.0
max f2,0.1291216,0.5971396,276.0
max f0point5,0.3057509,0.4180086,130.0
max accuracy,0.4643509,0.8390966,47.0
max precision,0.6824791,1.0,0.0
max recall,0.0109730,1.0,397.0
max specificity,0.6824791,1.0,0.0
max absolute_mcc,0.2339446,0.3150571,180.0
max min_per_class_accuracy,0.1788232,0.6988384,227.0


Gains/Lift Table: Avg response rate: 16.50 %, avg score: 16.68 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100272,0.5062228,3.8803945,3.8803945,0.6401869,0.5532967,0.6401869,0.5532967,0.0389094,0.0389094,288.0394535,288.0394535
,2,0.0200075,0.4595200,3.2725578,3.5771879,0.5399061,0.4813018,0.5901639,0.5173836,0.0326612,0.0715706,227.2557812,257.7187926
,3,0.0300347,0.4283629,2.8890529,3.3474517,0.4766355,0.4430735,0.5522621,0.4925749,0.0289690,0.1005396,188.9052866,234.7451728
,4,0.0400150,0.4076473,3.1302727,3.2932841,0.5164319,0.4180118,0.5433255,0.4739778,0.0312411,0.1317807,213.0272690,229.3284123
,5,0.0500422,0.3896203,2.5774883,3.1498569,0.4252336,0.3987704,0.5196629,0.4589081,0.0258449,0.1576257,157.7488341,214.9856878
,6,0.1000375,0.3312438,2.4086324,2.7794182,0.3973758,0.3581368,0.4585480,0.4085461,0.1204203,0.2780460,140.8632420,177.9418238
,7,0.1500328,0.2882296,2.1416378,2.5668911,0.3533271,0.3090047,0.4234853,0.3753760,0.1070719,0.3851179,114.1637789,156.6891149
,8,0.2000281,0.2542406,1.8576009,2.3896101,0.3064667,0.2710162,0.3942375,0.3492921,0.0928713,0.4779892,85.7600947,138.9610136
,9,0.3000187,0.2036983,1.4145035,2.0646253,0.2333646,0.2274077,0.3406216,0.3086703,0.1414371,0.6194263,41.4503473,106.4625344







In [None]:
feature_columns.to_csv("feature_columns.csv")

In [None]:
gradient_boosting_estimator = H2OGradientBoostingEstimator(
    stopping_metric = 'AUC',
    stopping_tolerance = 0.001,
    stopping_rounds = 5,
    score_tree_interval = 10,
    model_id="id",
    seed=2000000
)

In [None]:
gradient_boosting_estimator.train(predictors, response, training_frame=training_frame, validation_frame=validation_hf)
 

In [None]:
gradient_boosting_estimator.confusion_matrix(valid=True)

In [None]:
performance = gradient_boosting_estimator.model_performance(test_data=test_hf)
print(performance)

In [None]:
test = generate_columns_for_model(df)

In [45]:
feature_columns.isBreitbart.unique()

array(['0', '1'], dtype=object)