In [1]:
import findspark
findspark.init('/home/halil/anaconda/lib/python3.6/site-packages/pyspark')  #Anaconda spark location

In [2]:
#Control = http://localhost:4040/

from pyspark import SparkContext, SparkConf
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF,IDF
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

import re
import string
import Stemmer


In [3]:
#Setting property

conf = SparkConf()
conf.setAppName('mllib')
conf.set("spark.driver.memory",'8g')
conf.set("spark.executor.memory", '8g')
conf.setMaster('local[8]')
sc = SparkContext(
    conf=conf)

In [4]:
#Preprocessing

def tokenize_stem(doc):

    cleaned_doc = remove_punctuation(doc)
    tokens = cleaned_doc.split()
    stemmed_tokens = []
   
    for token in tokens:
        stem_token = stemmer(token)
        stemmed_tokens.append(stem_token)

    return stemmed_tokens


def token_stem_merge(doc):
    tokenized_doc = tokenize_stem(doc)
    merged_token = ""

    for token in tokenized_doc:
         merged_token += token + " "

    return merged_token.strip()


def stemmer(words):
    
    stemmer = Stemmer.Stemmer('turkish')
    return stemmer.stemWord(words)


def remove_punctuation(doc):
    delete_chars = string.digits + string.punctuation
    doc = remove_html_tags(doc).lower()
    doc = str(doc).translate(str.maketrans('', '', delete_chars))\
            .replace("\r", " ").replace("\n", " ").replace("“", " ").replace("’", " ")\
            .replace("", " ").replace("", " ").replace("", " ").replace("", " ").replace("‘", " ")\
            .replace("…", " ").replace("'", " ")

    doc = " ".join(doc.split())
    return doc

def remove_html_tags(doc):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', doc)
    return cleantext


In [5]:
tokenize_stem("Gün geçmiyorki olaylar olmasın.")

['gün', 'geçmiyorki', 'olay', 'olma']

In [6]:
#Data load

dunya_list = []
ekonomi_list = []
genel_list = []
guncel_list = []
magazin_list = []
spor_list = []

rootdir = "16752news"

import os


# files are read end labeled
for directories, subdirs, files in os.walk(rootdir):
    if (os.path.split(directories)[1] == 'dunya'):
        for filename in files:
            with open(os.path.join(directories, filename), encoding="UTF-8") as f:
                data = f.read()
                data = tokenize_stem(data)
                dunya_list.append(data)  # added to list

    if (os.path.split(directories)[1] == 'ekonomi'):
        for filename in files:
            with open(os.path.join(directories, filename), encoding="UTF-8") as f:
                data = f.read()
                data = tokenize_stem(data)
                ekonomi_list.append(data)

    if (os.path.split(directories)[1] == 'genel'):
        for filename in files:
            with open(os.path.join(directories, filename), encoding="UTF-8") as f:
                data = f.read()
                data = tokenize_stem(data)
                genel_list.append(data)

    if (os.path.split(directories)[1] == 'guncel'):
        for filename in files:
            with open(os.path.join(directories, filename), encoding="UTF-8") as f:
                data = f.read()
                data = tokenize_stem(data)
                guncel_list.append(data)


    if (os.path.split(directories)[1] == 'magazin'):
        for filename in files:
            with open(os.path.join(directories, filename), encoding="UTF-8") as f:
                data = f.read()
                data = tokenize_stem(data)
                magazin_list.append(data)  # added to list


    if (os.path.split(directories)[1] == 'spor'):
        for filename in files:
            with open(os.path.join(directories, filename), encoding="UTF-8") as f:
                data = f.read()
                data = tokenize_stem(data)
                spor_list.append(data)


print(len(dunya_list))
print(len(ekonomi_list))
print(len(genel_list))
print(len(guncel_list))
print(len(magazin_list))
print(len(spor_list))

combined_list = []

combined_list = dunya_list + ekonomi_list + genel_list + guncel_list  + magazin_list  + spor_list
print("sum news :  ", len(combined_list))


dunya_bag = sc.parallelize(dunya_list)
ekonomi_bag = sc.parallelize(ekonomi_list)
genel_bag = sc.parallelize(genel_list)
guncel_bag = sc.parallelize(guncel_list)
magazin_bag = sc.parallelize(magazin_list)
spor_bag = sc.parallelize(spor_list)



2792
2792
2792
2792
2792
2792
sum news :   16752


In [7]:
print(dunya_bag.collect()[0])

['camero', 'kritik', 'ab', 'konuşmas', 'lider', 'olduk', 'muhafazakar', 'parti', 'iç', 'ab', 'ye', 'şüphe', 'baka', 'üye', 'taraf', 'uz', 'sür', 'baskı', 'alt', 'ola', 've', 'ab', 'ye', 'faz', 'taviz', 'vermek', 'eleştirile', 'davidi', 'camero', 'ın', 'te', 'yapılacak', 'genel', 'seç', 'kazanmas', 'hal', 'ülke', 'ab', 'üyelik', 'ilgil', 'referandı', 'gidilecek', 'söylemes', 'öngörülüyor', 'camero', 'ın', 'geçe', 'haf', 'holla', 'da', 'yapacak', 'konuşma', 'cezayir', 'dek', 'reh', 'kriz', 'neden', 'ertelenmiş', 'brükseldeki̇', 'yetki̇', 'azalacak', 'i̇ngilter', 'dek', 'koalisyo', 'hükümet', 'büyük', 'ortak', 'muhafazakar', 'parti', 'yasal', 'hak', 'adalet', 'siste', 'gip', 'konu', 'brüksel', 'dek', 'yetki', 'azaltmak', 'istiyor', 'davidi', 'camero', 'i̇ngilter', 'nin', 'ab', 'de', 'kalmas', 'gerektik', 'söylüyor', 'ancak', 'ülke', 'çıkar', 'brüksel', 'in', 'çıkar', 'ön', 'geldik', 'de', 'savunuyor', 'i̇ngilter', 'başbakan', 'konuşma', 'ab', 'de', 'önemli', 'bir', 'mesaj', 'olarak', 'alg

In [8]:
# TF Mapping
dunya_tf = HashingTF().transform(dunya_bag)
ekonomi_tf = HashingTF().transform(ekonomi_bag)
genel_tf = HashingTF().transform(genel_bag)
guncel_tf = HashingTF().transform(guncel_bag)
magazin_tf = HashingTF().transform(magazin_bag)
spor_tf = HashingTF().transform(spor_bag)


dunya_tf.cache()
ekonomi_tf.cache()
genel_tf.cache()
guncel_tf.cache()
magazin_tf.cache()
spor_tf.cache()


PythonRDD[11] at RDD at PythonRDD.scala:53

In [9]:
print(dunya_tf.collect()[0])

(1048576,[15703,18510,27992,35334,39791,55038,69342,69879,72205,83543,93623,95022,96263,114863,115023,128943,142051,150860,156099,177083,194923,197608,199186,206229,211902,230732,251270,259228,283821,288081,308718,317549,317637,318552,319905,337828,338407,343040,344240,364370,367631,372029,373547,381787,382299,386511,394354,395788,404338,422515,444233,446311,454801,458702,460864,462919,466544,471644,478747,481515,503686,511205,523349,523958,527028,549056,551106,566297,575751,576289,588182,601305,622156,623132,626926,629201,661793,668664,669938,678702,690821,692189,692384,705499,713323,718825,720705,736943,739088,758462,763908,766936,769017,774116,782300,782309,782526,785341,794909,795781,801756,802440,815331,841422,848023,864485,870710,873253,875517,876235,886421,892790,894973,904484,905319,912786,913532,914631,917145,917598,919187,933913,942083,945174,952491,953367,965406,972225,989250,991546,997716,1002434,1010688,1025814,1026602,1028489,1033214,1037770],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,

In [10]:
#IDF transformation
dunya_idf = IDF().fit(dunya_tf)
ekonomi_idf = IDF().fit(ekonomi_tf)
genel_idf = IDF().fit(genel_tf)
guncel_idf = IDF().fit(guncel_tf)
magazin_idf = IDF().fit(magazin_tf)
spor_idf = IDF().fit(spor_tf)

In [11]:
dunya_idf

<pyspark.mllib.feature.IDFModel at 0x7fa1e33364e0>

In [12]:
#Tf-idf
dunya_tfidf = dunya_idf.transform(dunya_tf)
ekonomi_tfidf = ekonomi_idf.transform(ekonomi_tf)
genel_tfidf = genel_idf.transform(genel_tf)
guncel_tfidf = guncel_idf.transform(guncel_tf)
magazin_tfidf = magazin_idf.transform(magazin_tf)
spor_tfidf = spor_idf.transform(spor_tf)



In [13]:
dunya_tfidf

MapPartitionsRDD[51] at mapPartitions at PythonMLLibAPI.scala:1336

In [14]:
print(dunya_tfidf.collect()[0])

(1048576,[15703,18510,27992,35334,39791,55038,69342,69879,72205,83543,93623,95022,96263,114863,115023,128943,142051,150860,156099,177083,194923,197608,199186,206229,211902,230732,251270,259228,283821,288081,308718,317549,317637,318552,319905,337828,338407,343040,344240,364370,367631,372029,373547,381787,382299,386511,394354,395788,404338,422515,444233,446311,454801,458702,460864,462919,466544,471644,478747,481515,503686,511205,523349,523958,527028,549056,551106,566297,575751,576289,588182,601305,622156,623132,626926,629201,661793,668664,669938,678702,690821,692189,692384,705499,713323,718825,720705,736943,739088,758462,763908,766936,769017,774116,782300,782309,782526,785341,794909,795781,801756,802440,815331,841422,848023,864485,870710,873253,875517,876235,886421,892790,894973,904484,905319,912786,913532,914631,917145,917598,919187,933913,942083,945174,952491,953367,965406,972225,989250,991546,997716,1002434,1010688,1025814,1026602,1028489,1033214,1037770],[4.3239536533,3.23439120015,4

In [15]:
#Label 1,2,3,4,5 and 6

dunya_tfidf_label = dunya_tfidf.map(lambda x: LabeledPoint('1', x))
ekonomi_tfidf_label = ekonomi_tfidf.map(lambda x: LabeledPoint('2', x))
genel_tfidf_label = genel_tfidf.map(lambda x: LabeledPoint('3', x))
guncel_tfidf_label = guncel_tfidf.map(lambda x: LabeledPoint('4', x))
magazin_tfidf_label = magazin_tfidf.map(lambda x: LabeledPoint('5', x))
spor_tfidf_label = spor_tfidf.map(lambda x: LabeledPoint('6', x))

In [16]:
dunya_tfidf_label

PythonRDD[72] at RDD at PythonRDD.scala:53

In [17]:
dunya_tfidf_label.collect()[0]

LabeledPoint(1.0, (1048576,[15703,18510,27992,35334,39791,55038,69342,69879,72205,83543,93623,95022,96263,114863,115023,128943,142051,150860,156099,177083,194923,197608,199186,206229,211902,230732,251270,259228,283821,288081,308718,317549,317637,318552,319905,337828,338407,343040,344240,364370,367631,372029,373547,381787,382299,386511,394354,395788,404338,422515,444233,446311,454801,458702,460864,462919,466544,471644,478747,481515,503686,511205,523349,523958,527028,549056,551106,566297,575751,576289,588182,601305,622156,623132,626926,629201,661793,668664,669938,678702,690821,692189,692384,705499,713323,718825,720705,736943,739088,758462,763908,766936,769017,774116,782300,782309,782526,785341,794909,795781,801756,802440,815331,841422,848023,864485,870710,873253,875517,876235,886421,892790,894973,904484,905319,912786,913532,914631,917145,917598,919187,933913,942083,945174,952491,953367,965406,972225,989250,991546,997716,1002434,1010688,1025814,1026602,1028489,1033214,1037770],[4.32395365

In [18]:
#Split news data 80/20 into training and test data sets
dtrain, dtest = dunya_tfidf_label.randomSplit([0.8, 0.2],seed=0)
etrain, etest = ekonomi_tfidf_label.randomSplit([0.8, 0.2],seed = 0)
gtrain, gtest = genel_tfidf_label.randomSplit([0.8, 0.2],seed = 0)
gutrain, gutest = guncel_tfidf_label.randomSplit([0.8, 0.2],seed = 0)
mtrain, mtest = magazin_tfidf_label.randomSplit([0.8, 0.2],seed = 0)
strain, stest = spor_tfidf_label.randomSplit([0.8, 0.2],seed = 0)



train1 = dtrain.union(etrain)
train2 = train1.union(gtrain)
train3 = train2.union(gutrain)
train4 = train3.union(mtrain)
trainh = train4.union(strain)



test1=dtest.union(etest)
test2=test1.union(gtest)
test3=test2.union(gutest)
test4=test3.union(mtest)
testh=test4.union(stest)




trainh.cache()
testh.cache()

UnionRDD[94] at union at NativeMethodAccessorImpl.java:0

In [19]:
latest_train = trainh.union(testh)

In [20]:
latest_train.collect()[0]

LabeledPoint(1.0, (1048576,[15703,18510,27992,35334,39791,55038,69342,69879,72205,83543,93623,95022,96263,114863,115023,128943,142051,150860,156099,177083,194923,197608,199186,206229,211902,230732,251270,259228,283821,288081,308718,317549,317637,318552,319905,337828,338407,343040,344240,364370,367631,372029,373547,381787,382299,386511,394354,395788,404338,422515,444233,446311,454801,458702,460864,462919,466544,471644,478747,481515,503686,511205,523349,523958,527028,549056,551106,566297,575751,576289,588182,601305,622156,623132,626926,629201,661793,668664,669938,678702,690821,692189,692384,705499,713323,718825,720705,736943,739088,758462,763908,766936,769017,774116,782300,782309,782526,785341,794909,795781,801756,802440,815331,841422,848023,864485,870710,873253,875517,876235,886421,892790,894973,904484,905319,912786,913532,914631,917145,917598,919187,933913,942083,945174,952491,953367,965406,972225,989250,991546,997716,1002434,1010688,1025814,1026602,1028489,1033214,1037770],[4.32395365

In [21]:
train_data, test_data = latest_train.randomSplit([0.8, 0.2],seed=0)

#Train a naive Bayes model.
model = NaiveBayes.train(train_data, 1.0)

#Make prediction and test accuracy.
predictionAndLabel = test_data.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test_data.count()
print('model accuracy {}'.format(accuracy))



model accuracy 0.6965925925925925
