# Logistic Regression - Extra Credit
## Wright, Kyle

In [125]:
# Show the Spark version behind the `spark` object.
# NOTE(review): `spark` is not created anywhere in this notebook — presumably the
# kernel (e.g. a pyspark shell) provides it; confirm before running elsewhere.
spark.version


'2.3.2'

In [126]:
# Display the SparkContext attached to `spark`; later cells use it to read the text files.
spark.sparkContext

## Input files

In [128]:
import os

# Directory holding the training corpora. Defaults to the original location;
# set the SPAM_DATA_DIR environment variable to run the notebook on another machine.
DATA_DIR = os.environ.get("SPAM_DATA_DIR", "C:/Spark")

# One file per class: spam messages and non-spam ("ham") messages.
input_spam = DATA_DIR + "/spam.txt"
input_ham = DATA_DIR + "/nospam.txt"

## Create RDDs

In [129]:
# Read the spam corpus into an RDD of text lines.
spam = spark.sparkContext.textFile(name=input_spam)

In [130]:
# Read the non-spam corpus into an RDD of text lines.
nospam = spark.sparkContext.textFile(name=input_ham)

## Create the term-frequency hasher

In [157]:
# Imported in this cell because the notebook shows no import cell; without it
# this line fails on a fresh kernel.
from pyspark.mllib.feature import HashingTF

# Term-frequency hasher producing 256-dimensional feature vectors. The same
# instance is reused for training and test data so the features line up.
tf = HashingTF(numFeatures=256)

## Featurize RDDs

In [158]:
# Turn every spam line into a hashed term-frequency vector; tokens are the
# pieces obtained by splitting on a single space.
featurize_spam = spam.map(lambda line: tf.transform(line.split(" ")))

In [159]:
# Turn every non-spam line into a hashed term-frequency vector; tokens are the
# pieces obtained by splitting on a single space.
featurize_nospam = nospam.map(lambda line: tf.transform(line.split(" ")))

## Label spam (1) and non-spam (0) examples for training

In [160]:
# Imported at first use because the notebook shows no import cell; without it
# this line fails on a fresh kernel.
from pyspark.mllib.regression import LabeledPoint

# Spam vectors receive label 1.
# NOTE(review): despite the name, `negativeExamples` holds the label-1 (spam)
# records. The name is kept unchanged because later cells reference it.
negativeExamples = featurize_spam.map(lambda features: LabeledPoint(1, features))

In [161]:
# Non-spam vectors receive label 0.
# NOTE(review): despite the name, `positiveExamples` holds the label-0 (non-spam) records.
positiveExamples = featurize_nospam.map(lambda vec: LabeledPoint(0, vec))

## Combine featurized spam and nospam

In [162]:
# Concatenate the label-1 (spam) and label-0 (non-spam) records into a single
# training RDD.
Training_data = negativeExamples.union(positiveExamples)

In [163]:
# Mark the training RDD for caching before it is handed to the trainer.
# As the last expression in the cell, the RDD's repr is displayed.
Training_data.cache()

UnionRDD[1324] at union at NativeMethodAccessorImpl.java:0

## Train on data

In [164]:
# Imported in this cell because the notebook shows no import cell; without it
# this line fails on a fresh kernel.
from pyspark.mllib.classification import LogisticRegressionWithSGD

# `train` is a classmethod, so it is called on the class rather than on a
# throwaway instance. All training hyper-parameters are left at their defaults.
# NOTE(review): the PySpark docs mark LogisticRegressionWithSGD as deprecated
# since 2.0.0 in favour of LogisticRegressionWithLBFGS or
# ml.classification.LogisticRegression; switching would change the fitted model.
model = LogisticRegressionWithSGD.train(Training_data)

## Input test file

In [165]:
import os

# Directory holding the test file. Defaults to the original location; set the
# SPAM_DATA_DIR environment variable to run the notebook on another machine.
DATA_DIR = os.environ.get("SPAM_DATA_DIR", "C:/Spark")

# File with the emails to classify.
input_10emails = DATA_DIR + "/10emails.txt"

In [166]:
# Read the emails to classify into an RDD of text lines.
predictrdd = spark.sparkContext.textFile(name=input_10emails)

## Predict spam / non-spam for the test emails

In [167]:
# Each test line has the form "<id>\t<email text>" (see the collected output).
# Only the text after the first tab is featurized, so the ID is not fused into
# the first token and hashed as if it were a word of the email. A line without
# a tab is featurized whole. The full original line is kept as the first tuple
# element so the ID can still be extracted later.
# NOTE(review): assumes the training files carry no "<id>\t" prefix — confirm.
testemails = predictrdd.map(
    lambda email: (
        email,
        model.predict(tf.transform(email.split("\t", 1)[-1].split(" "))),
    )
)
testemails.collect()

[("e100\tSubject: web access to our email system Do we have a web access to our email system? Hi Alex, Here are the directions: go to https://mail.limelife.com/exchange (please note the 's' in the web address - it's a secure server so you have to have the 's' there) if you are asked to verify a certificate, please accept and continue",
  1),
 ('e200\tHow I Lost 30 Lbs If you are experiencing difficulty viewing this important message You received this message because you signed up at a select affiliate website. If you wish to unsubscribe from our newsletter please click or send post email to: 1703 Sudderth Dr,,# 341,Ruidoso,NM,88345',
  1),
 ('e300\tImpacting Liberty Today: Harvard student is booted from apartment for owning guns. Census confirms that most immigrants are on welfare. And an illegal alien violently rapes mother. Census Confirms: Most Immigrants On Welfare Analyzing data from 2014, the bureau found that a whopping 63 percent of non-citizen immigrants are using a welfare pr

## Convert for output

In [170]:
# Reduce each (line, prediction) pair to (email id, verdict): the id is the text
# before the first tab, and the verdict is 'spam' for prediction 1, otherwise 'nospam'.
output = testemails.map(
    lambda pair: (pair[0].split("\t")[0], 'spam' if pair[1] == 1 else 'nospam'))

In [171]:
# Bring the (email id, verdict) pairs back to the driver and display them.
output.collect()

[('e100', 'spam'),
 ('e200', 'spam'),
 ('e300', 'spam'),
 ('e400', 'nospam'),
 ('e500', 'nospam'),
 ('e600', 'nospam'),
 ('e700', 'nospam'),
 ('e800', 'spam'),
 ('e801', 'nospam'),
 ('e802', 'nospam')]