In [1]:
import pandas as pd
import numpy as np
import gensim
import datetime
import re
import os
import logging
import time
from h2o.estimators.word2vec import H2OWord2vecEstimator
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.estimators import H2ORandomForestEstimator
import h2o
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK 'stopwords' package (a no-op when it is already up to date
# locally) so that stopwords.words(...) can be used further down.
nltk.download('stopwords')
# Emit timestamped log records at INFO level and above.
# NOTE(review): presumably enabled so gensim's training progress is visible — confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/toprak.ucar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Collect every CSV file found directly under data/ and stack them into a
# single DataFrame named `df`.
csv_files = [name for name in os.listdir("data/") if name.endswith('.csv')]
print(csv_files)

# Read each file once and concatenate in a single call.  DataFrame.append was
# removed in pandas 2.0, and calling it in a loop re-copied the accumulated
# frame on every iteration.  Row labels are kept as read from each file (no
# ignore_index), matching what the append-based loop produced.
frames = [pd.read_csv("data/" + name) for name in csv_files]
# pd.concat raises on an empty list, so fall back to an empty frame to keep
# the previous result when data/ contains no CSV files.
df = pd.concat(frames) if frames else pd.DataFrame()

['articles1.csv', 'articles3.csv', 'articles2.csv']


In [3]:
# Inspect the column names of the combined frame.
df.columns

Index(['Unnamed: 0', 'id', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content'],
      dtype='object')

In [4]:
# Discard the metadata columns that the rest of the notebook never reads.
unused_columns = ['author', 'date', 'year', 'month', 'url']
df = df.drop(columns=unused_columns)

In [5]:
# Normalise article titles to lower case.
df = df.assign(title=df['title'].str.lower())

In [6]:
# Normalise publication names to lower case so they can be compared and
# grouped case-insensitively.
df = df.assign(publication=df['publication'].str.lower())

In [7]:
# Normalise the article body to lower case before tokenisation.
df = df.assign(content=df['content'].str.lower())

In [8]:
# Replace every character that is not an ASCII letter or one of ğ, ü, ş, ç, ö
# with a space.  regex=True is passed explicitly: pandas 2.0 changed the
# default of Series.str.replace to literal matching, which would silently turn
# this character class into a no-op.
# NOTE(review): other accented letters (é, á, ı, ...) are blanked out as well,
# which truncates names — see the 'piqu', 'bogot' and 'jos' tokens in the
# recorded similarity outputs.  Consider a Unicode-aware class if unintended.
df['content'] = df['content'].str.replace('[^a-zA-Zğüşçö]', ' ', regex=True)

In [9]:
# English stop-word list from the NLTK 'stopwords' corpus; used below to
# filter tokens out of the article text.
stop_words = stopwords.words('english')

In [10]:
# Drop English stop words from every article and re-join the remaining tokens
# with single spaces.
# Membership is tested against a set: `stop_words` is a list, so the previous
# `word not in (stop_words)` scanned the whole list for every token of every
# article.  The filtered text is identical.
stop_word_set = set(stop_words)
df['content'] = df['content'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in stop_word_set)
)

In [11]:
# Number of non-null values in each column, per publication.
df.groupby(['publication']).count()

Unnamed: 0_level_0,Unnamed: 0,id,title,content
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
atlantic,7179,7179,7179,7179
breitbart,23781,23781,23781,23781
business insider,6757,6757,6757,6757
buzzfeed news,4854,4854,4854,4854
cnn,11488,11488,11488,11488
fox news,4354,4354,4354,4354
guardian,8681,8681,8681,8681
national review,6203,6203,6203,6203
new york post,17493,17493,17493,17493
new york times,7803,7803,7803,7803


In [12]:
# Total number of rows in the combined frame.
len(df)

142570

In [13]:
# Binary label stored as strings: "1" for rows published by Breitbart,
# "0" for every other publication.
from_breitbart = df['publication'].eq('breitbart')
df["isBreitbart"] = np.where(from_breitbart, "1", "0")

In [14]:
# Tokenise every article into a list of words; `words` is the list of token
# lists passed to Word2Vec below.
# Iterating the column directly replaces two df.iloc[...] row lookups per
# article, each of which materialised a whole row.  Only real strings are
# split, so missing values (NaN floats) are still skipped.
words = [text.split() for text in df['content'] if isinstance(text, str)]

In [15]:
# Train a Word2Vec embedding on the tokenised articles.
# Fix: the context-window keyword was misspelled `windoaw`, which Word2Vec
# rejects as an unexpected keyword argument, so the cell could not run on a
# fresh kernel.
# NOTE(review): `size` and `iter` are the gensim 3.x parameter names; gensim 4
# renamed them to `vector_size` and `epochs`.  With workers > 1 training is
# not bit-for-bit reproducible, so similarity scores vary slightly per run.
model = gensim.models.Word2Vec(
    words,
    window=50,     # maximum distance between the current and predicted word
    size=1000,     # dimensionality of the word vectors
    iter=5,        # number of passes over the corpus
    min_count=3,   # ignore words seen fewer than 3 times
    workers=4,     # training threads
)

In [16]:
# List the words whose vectors are most similar to "istanbul".
model.wv.similar_by_word("istanbul")

  if np.issubdtype(vec.dtype, np.int):


[('bosporus', 0.6041005849838257),
 ('reina', 0.5746868848800659),
 ('bosphorus', 0.573968231678009),
 ('ataturk', 0.5490278005599976),
 ('mehmet', 0.5461439490318298),
 ('ankara', 0.5452790260314941),
 ('hurriyet', 0.5435633659362793),
 ('turkish', 0.5372835993766785),
 ('pamuk', 0.5340129137039185),
 ('istiklal', 0.5253186225891113)]

In [17]:
# List the words whose vectors are most similar to "messi".
model.wv.similar_by_word("messi")

  if np.issubdtype(vec.dtype, np.int):


[('neymar', 0.6838817596435547),
 ('ronaldo', 0.6767688989639282),
 ('barcelona', 0.6438531279563904),
 ('goalkeeper', 0.6131119728088379),
 ('cristiano', 0.6047729253768921),
 ('sevilla', 0.5975548028945923),
 ('barca', 0.5932743549346924),
 ('griezmann', 0.5716344118118286),
 ('piqu', 0.5712944269180298),
 ('striker', 0.5618128776550293)]

In [18]:
# List the words whose vectors are most similar to "ataturk".
model.wv.similar_by_word("ataturk")

  if np.issubdtype(vec.dtype, np.int):


[('kemal', 0.5918941497802734),
 ('atatürk', 0.5512984991073608),
 ('istanbul', 0.5490278005599976),
 ('coups', 0.49630677700042725),
 ('ottoman', 0.4951777160167694),
 ('akp', 0.48499637842178345),
 ('erdoğan', 0.4355770945549011),
 ('mustafa', 0.4154052734375),
 ('ankara', 0.41123801469802856),
 ('turkey', 0.41080719232559204)]

In [19]:
# Analogy-style query: 'paris' and 'turkey' contribute positively and 'france'
# negatively (roughly paris - france + turkey).
model.wv.most_similar(positive=['paris', 'turkey'], negative=['france'])

  if np.issubdtype(vec.dtype, np.int):


[('ankara', 0.45684361457824707),
 ('turkish', 0.4347909688949585),
 ('istanbul', 0.3412559926509857),
 ('turks', 0.31825536489486694),
 ('incirlik', 0.2970038950443268),
 ('lira', 0.2946658432483673),
 ('ahmet', 0.28686249256134033),
 ('recep', 0.2867054343223572),
 ('izmir', 0.28641149401664734),
 ('erdogan', 0.2806168496608734)]

In [20]:
# Analogy-style query: 'trump' and 'turkey' contribute positively and
# 'america' negatively (roughly trump - america + turkey).
model.wv.most_similar(positive=['trump', 'turkey'], negative=['america'])

  if np.issubdtype(vec.dtype, np.int):


[('turkish', 0.4010379910469055),
 ('ankara', 0.388433575630188),
 ('erdoğan', 0.3822430968284607),
 ('erdogan', 0.3731622099876404),
 ('yildirim', 0.3409467041492462),
 ('davutoglu', 0.3317990005016327),
 ('turks', 0.29959139227867126),
 ('recep', 0.286358118057251),
 ('lira', 0.27761685848236084),
 ('hurriyet', 0.27162522077560425)]

In [21]:
# Analogy-style query: 'messi' and 'madrid' contribute positively and
# 'barcelona' negatively (roughly messi - barcelona + madrid).
model.wv.most_similar(positive=['messi', 'madrid'], negative=['barcelona'])

  if np.issubdtype(vec.dtype, np.int):


[('ronaldo', 0.6099832057952881),
 ('cristiano', 0.5582559704780579),
 ('atl', 0.5154239535331726),
 ('atletico', 0.5150270462036133),
 ('neymar', 0.5147936344146729),
 ('sevilla', 0.5030616521835327),
 ('tico', 0.5014896988868713),
 ('lionel', 0.49445661902427673),
 ('isco', 0.48357024788856506),
 ('maradona', 0.47977331280708313)]

In [23]:
# Analogy-style query: 'paris' and 'spain' contribute positively and 'france'
# negatively (roughly paris - france + spain).
model.wv.most_similar(positive=['paris', 'spain'], negative=['france'])

  if np.issubdtype(vec.dtype, np.int):


[('madrid', 0.4317438304424286),
 ('barcelona', 0.38237810134887695),
 ('milan', 0.36274564266204834),
 ('spaniards', 0.33720114827156067),
 ('bogot', 0.33717870712280273),
 ('spanish', 0.3279109001159668),
 ('roberto', 0.324995219707489),
 ('jos', 0.3247702419757843),
 ('ignacio', 0.32421284914016724),
 ('claudio', 0.31243082880973816)]

In [24]:
# Persist the trained model to model/model_for_news.
# The target directory is created first: saving into a directory that does
# not exist fails and would discard the (expensive) training run on a fresh
# checkout.
model_dir = "model/"
os.makedirs(model_dir, exist_ok=True)
model.save(fname_or_handle=model_dir + "model_for_news")