In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import StopWordsRemover

from nltk.corpus import stopwords
import numpy as np
import pandas as pd

from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
import string
from pyspark.ml.feature import HashingTF, IDF, Word2Vec
from pyspark.mllib.linalg import Vectors
import timeit
import sys
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import NaiveBayes, LogisticRegression, DecisionTreeClassifier,RandomForestClassifier,LogisticRegressionModel
from pyspark.mllib.classification import SVMWithSGD, SVMModel

In [2]:
def remove_punctuation(sentence):
    punctuations = list(string.punctuation)
    extra_punctuations = ['.', '``', '...', '\'s', '--', '-', 'n\'t', '_', '–']
    punctuations += extra_punctuations
    filtered = [w for w in sentence.lower() if w not in punctuations]
    return ("".join(filtered)).split()

def clean_data(file):
    data = sc.textFile(file)
    col_rdd = data.map(lambda x: (x.split('\t')[0], x[-1]))
    punctuation_removed_rdd = col_rdd.map(lambda x: (remove_punctuation(x[0]), float(x[1])))
    data_df = sqlContext.createDataFrame(punctuation_removed_rdd, ["text", "label"])
    
    remover = StopWordsRemover(inputCol="text", outputCol="words", stopWords=stopwords.words('english'))
    a = remover.transform(data_df).select(["label", "words"])
    return a

def tf_idf(data_rdd_df):
    #data_rdd_df = data_rdd_df.toDF()  #use it when random split
    hashing_tf = HashingTF(inputCol="words", outputCol="tf_features")
    tf_data = hashing_tf.transform(data_rdd_df)

    idf_data = IDF(inputCol="tf_features", outputCol="features").fit(tf_data)
    tf_idf_data = idf_data.transform(tf_data)
    return tf_idf_data.select(["label", "words", "features"])


In [3]:
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

In [4]:
#detectionMultil.txt : contains tweets in en, fr, gr 
# + sentences in these three languages (http://downloads.tatoeba.org/exports/sentences.tar.bz2)

#filtered_data_df = clean_data("data/detectionMultil.txt")
#training, test = filtered_data_df.rdd.randomSplit([0.7, 0.3], seed=0)

training = clean_data("data/detectionMultil.txt")
test = clean_data("data/detectionTest.txt")
print("Size of train dataset: ",training.count())
print("Size of test dataset: ",test.count())

train_df = tf_idf(training)
test_df = tf_idf(test)

#train the first time and save
lor = LogisticRegression(regParam=0.01)
start = timeit.default_timer()
model = lor.fit(train_df)
stop = timeit.default_timer()
print(' LR Time: ', stop - start)
# model.save("lr_LangModel.model")

#then just load predicted model
# model = LogisticRegressionModel.load("lr_LangModel.model")
#lor_predicted_df = model.transform(test_df).select(["label", "words", "prediction"])
lor_predicted_df = model.transform(test_df)

Size of train dataset:  160726
Size of test dataset:  101189
 LR Time:  25.300158759000624


In [5]:
accuracy = 1.0 * lor_predicted_df.filter(lor_predicted_df.label == lor_predicted_df.prediction).count() / lor_predicted_df.count()
print("Accuracy = ", accuracy)
print("------------")
greek = lor_predicted_df.filter((lor_predicted_df.label == 1.0) ).count()
french = lor_predicted_df.filter((lor_predicted_df.label == 2.0) ).count()
english = lor_predicted_df.filter((lor_predicted_df.label == 3.0) ).count()
print("Actual number of tweets")
print("greek: ", greek)
print("french: ", french)
print("english: ", english)
print("------------")

greek2 = lor_predicted_df.filter((lor_predicted_df.prediction == 1.0)).count()
french2 = lor_predicted_df.filter((lor_predicted_df.prediction == 2.0)).count()
english2 = lor_predicted_df.filter((lor_predicted_df.prediction == 3.0)).count()
print("Predicted number of tweets")
print("greek: ", greek2)
print("french: ", french2)
print("english: ", english2)
print("------------")

lost = lor_predicted_df.filter((lor_predicted_df.prediction != lor_predicted_df.label)).count()
print("Lost = ", lost)

Accuracy =  0.8915494767217781
------------
Actual number of tweets
greek:  33393
french:  33896
english:  33900
------------
Predicted number of tweets
greek:  35161
french:  25526
english:  40502
------------
Lost =  10974
