In [1]:
import pyspark 
import csv
from pyspark.sql import SQLContext, SparkSession
import pandas as pd
from pandas import DataFrame
import re
import numpy as np
import nltk
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.tokenize import word_tokenize # or use some other tokenizer
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.sql import Row

In [2]:
# Settings for the Spark application: its display name and the cluster URL.
# "local" runs Spark inside this process instead of on a remote cluster.
appName = "Model training"
master = "local"

# Build the session, or reuse one that is already running in this kernel.
spark = (
    SparkSession.builder
    .appName(appName)
    .master(master)
    .getOrCreate()
)

In [3]:
# Load the raw training CSV into a plain Python list, one list of strings per row.
# newline="" is required by the csv module so that line breaks embedded inside
# quoted fields reach the parser untouched instead of being rewritten by
# universal-newline translation.
# NOTE(review): this materialises the entire file in memory and no visible cell
# reads `data` afterwards -- confirm it is still needed.
with open(
    'training.1600000.processed.noemoticon.csv',
    encoding="ISO-8859-1",
    newline="",
) as f:
    reader = csv.reader(f)
    data = list(reader)

In [4]:
# Read the header-less training CSV into a Spark DataFrame (columns _c0 .. _c5).
# The plain-Python load of this same file opens it as ISO-8859-1; Spark's CSV
# reader defaults to UTF-8, which would mis-decode the non-ASCII bytes in the
# tweet text, so the encoding is stated explicitly here as well.
df = spark.read.csv(
    "training.1600000.processed.noemoticon.csv",
    encoding="ISO-8859-1",
)

In [5]:
# Read the test CSV into its own Spark DataFrame, using the reader's defaults.
df_testing = spark.read.csv(
    path="testdata.manual.2009.06.14.csv",
)

In [6]:
# Keep only the rows whose first column (_c0) equals 4, then preview them.
is_label_four = df["_c0"] == 4
df2 = df.filter(is_label_four)
df2.show()

+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  4|1467822272|Mon Apr 06 22:22:...|NO_QUERY|          ersle|I LOVE @Health4Ua...|
|  4|1467822273|Mon Apr 06 22:22:...|NO_QUERY|       becca210|im meeting up wit...|
|  4|1467822283|Mon Apr 06 22:22:...|NO_QUERY|      Wingman29|@DaRealSunisaKim ...|
|  4|1467822287|Mon Apr 06 22:22:...|NO_QUERY|      katarinka|Being sick can be...|
|  4|1467822293|Mon Apr 06 22:22:...|NO_QUERY|    _EmilyYoung|@LovesBrooklyn2 h...|
|  4|1467822391|Mon Apr 06 22:22:...|NO_QUERY|  ajarofalmonds|@ProductOfFear Yo...|
|  4|1467822447|Mon Apr 06 22:22:...|NO_QUERY|      vmdavinci|@r_keith_hill Tha...|
|  4|1467822465|Mon Apr 06 22:22:...|NO_QUERY|  jessicavaliyi|@KeepinUpWKris I ...|
|  4|1467822489|Mon Apr 06 22:22:...|NO_QUERY|     emmasaur28|@tommcfly ah, 

In [7]:
def _select_text_and_label(frame):
    """Project a raw CSV frame down to its two modelling columns.

    Keeps column _c5 (renamed to "text") and column _c0 (renamed to "labels"),
    in that order, and returns the resulting DataFrame.
    """
    picked = frame.select("_c5", "_c0")
    picked = picked.withColumnRenamed("_c0", "labels")
    return picked.withColumnRenamed("_c5", "text")


# Training split.
text_and_label = _select_text_and_label(df)
text_and_label.show()

# Test split, projected the same way.
text_and_label_test = _select_text_and_label(df_testing)
text_and_label_test.show()

+--------------------+------+
|                text|labels|
+--------------------+------+
|@switchfoot http:...|     0|
|is upset that he ...|     0|
|@Kenichan I dived...|     0|
|my whole body fee...|     0|
|@nationwideclass ...|     0|
|@Kwesidei not the...|     0|
|         Need a hug |     0|
|@LOLTrish hey  lo...|     0|
|@Tatiana_K nope t...|     0|
|@twittera que me ...|     0|
|spring break in p...|     0|
|I just re-pierced...|     0|
|@caregiving I cou...|     0|
|@octolinz16 It it...|     0|
|@smarrison i woul...|     0|
|@iamjazzyfizzle I...|     0|
|Hollis' death sce...|     0|
|about to file taxes |     0|
|@LettyA ahh ive a...|     0|
|@FakerPattyPattz ...|     0|
+--------------------+------+
only showing top 20 rows

+--------------------+------+
|                text|labels|
+--------------------+------+
|@stellargirl I lo...|     4|
|Reading my kindle...|     4|
|Ok, first assesme...|     4|
|@kenburbary You'l...|     4|
|@mikefish  Fair e...|     4|
|@richardebake

In [8]:
# Configure a tokenizer that reads the "text" column and writes its tokens to "words".
tokenizer = Tokenizer().setInputCol("text").setOutputCol("words")

# Apply it to the training frame and preview the added token column.
tokenized = tokenizer.transform(text_and_label)
tokenized.show()

+--------------------+------+--------------------+
|                text|labels|               words|
+--------------------+------+--------------------+
|@switchfoot http:...|     0|[@switchfoot, htt...|
|is upset that he ...|     0|[is, upset, that,...|
|@Kenichan I dived...|     0|[@kenichan, i, di...|
|my whole body fee...|     0|[my, whole, body,...|
|@nationwideclass ...|     0|[@nationwideclass...|
|@Kwesidei not the...|     0|[@kwesidei, not, ...|
|         Need a hug |     0|      [need, a, hug]|
|@LOLTrish hey  lo...|     0|[@loltrish, hey, ...|
|@Tatiana_K nope t...|     0|[@tatiana_k, nope...|
|@twittera que me ...|     0|[@twittera, que, ...|
|spring break in p...|     0|[spring, break, i...|
|I just re-pierced...|     0|[i, just, re-pier...|
|@caregiving I cou...|     0|[@caregiving, i, ...|
|@octolinz16 It it...|     0|[@octolinz16, it,...|
|@smarrison i woul...|     0|[@smarrison, i, w...|
|@iamjazzyfizzle I...|     0|[@iamjazzyfizzle,...|
|Hollis' death sce...|     0|[h

In [9]:
# Inspect what kind of object a column reference on the DataFrame is.
words_column = tokenized["words"]
print(type(words_column))

<class 'pyspark.sql.column.Column'>


In [10]:
# Tokenise the "text" column of the training frame into a "words" column.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(text_and_label)

# Hash each token list into a 20-slot term-frequency vector ("rawFeatures").
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

In [14]:
# Preview each label next to its hashed term-frequency vector.
featurizedData.select(["labels", "rawFeatures"]).show()

+------+--------------------+
|labels|         rawFeatures|
+------+--------------------+
|     0|(20,[1,5,6,7,8,10...|
|     0|(20,[0,1,4,6,7,9,...|
|     0|(20,[0,4,5,7,8,11...|
|     0|(20,[1,7,8,10,11,...|
|     0|(20,[0,2,4,5,6,11...|
|     0|(20,[5,7,17,18],[...|
|     0|(20,[7,17,18],[1....|
|     0|(20,[1,4,6,7,8,9,...|
|     0|(20,[6,8,13,14,17...|
|     0|(20,[7,9,11],[3.0...|
|     0|(20,[3,4,7,8,11,1...|
|     0|(20,[6,7,8,16,17]...|
|     0|(20,[0,3,5,8,9,11...|
|     0|(20,[1,5,6,7,8,9,...|
|     0|(20,[4,5,6,7,10,1...|
|     0|(20,[0,2,3,6,8,9,...|
|     0|(20,[0,2,3,4,5,6,...|
|     0|(20,[8,14,16],[2....|
|     0|(20,[0,3,8,9,10,1...|
|     0|(20,[0,2,4,8,9,11...|
+------+--------------------+
only showing top 20 rows



In [None]:
## Brute force way: train an NLTK Naive Bayes classifier on a small sample.

# word_tokenize needs the 'punkt' tokenizer models.
nltk.download('punkt')

# NOTE(review): pandasDF / pandasDFtest are not created by any earlier cell in
# this notebook, so on a fresh kernel this cell raised NameError. They are
# rebuilt here from the two-column (text, labels) Spark frames -- confirm this
# matches how they were originally produced.
pandasDF = text_and_label.toPandas()
pandasDF.columns = ["feature", "label"]
feature = pandasDF.feature
labels = pandasDF.label

pandasDFtest = text_and_label_test.toPandas()
pandasDFtest.columns = ["feature", "label"]
featuretest = pandasDFtest.feature
labelstest = pandasDFtest.label

# Extract features.
# Column order is (feature, label), i.e. (tweet text, sentiment label).
# Take every 10000th row *before* iterating so that only the sample, not the
# whole frame, is walked row by row in Python.
featuresets = [
    (text, label)
    for _, (text, label) in pandasDF.iloc[0::10000].iterrows()
]

# Tokenise each sampled tweet exactly once and lower-case its tokens, so the
# vocabulary and the per-tweet membership test use the same casing.
tokenized_sample = [
    ({token.lower() for token in word_tokenize(text)}, label)
    for text, label in featuresets
]
all_words = set().union(*(tokens for tokens, _ in tokenized_sample))

# One boolean "contains(word)" feature per vocabulary word, paired with the label.
t = [
    ({word: (word in tokens) for word in all_words}, label)
    for tokens, label in tokenized_sample
]

# Train a classifier
classifier = nbc.train(t)

# Classify an unseen sentence, encoded with the same lower-cased vocabulary.
test_sentence = "This is the best band I've ever heard!"
test_tokens = set(word_tokenize(test_sentence.lower()))
test_sent_features = {word: (word in test_tokens) for word in all_words}

# Printed explicitly: only a cell's final expression is echoed automatically.
print(classifier.classify(test_sent_features))

classifier.show_most_informative_features(15)
