# Linear Regression demo using just one feature.

In [1]:
import findspark
findspark.init('C:/spark-2.3.2-bin-hadoop2.7')

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, avg, col, concat, desc, explode, lit, min, max, split
from pyspark.sql.types import IntegerType

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer, IDF, Normalizer, PCA, RegexTokenizer, StandardScaler, StopWordsRemover, VectorAssembler, StringIndexer, MinMaxScaler, PCA
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import re

In [3]:
data_dir = 'C:/Users/John/PycharmProjects/customer-attrition/data/'

In [4]:
spark = SparkSession.builder.appName("spark machine learning").getOrCreate()

In [5]:
df = spark.read.json(data_dir + 'Train_onetag_small.json')
df.persist()

DataFrame[Body: string, Id: bigint, Tags: string, Title: string, oneTag: string]

In [6]:
regexTokenizer = RegexTokenizer(inputCol='Body', outputCol='words', pattern="\\W")
df = regexTokenizer.transform(df)
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [7]:
body_length = udf(lambda x: len(x), IntegerType())
df = df.withColumn('BodyLength', body_length('words'))

In [8]:
number_of_paragraphs = udf(lambda x: len(re.findall('</p>',x)), IntegerType())
number_of_links = udf(lambda x: len(re.findall('</a>',x)), IntegerType())

In [9]:
df = df.withColumn('NumParagraphs', number_of_paragraphs('Body'))
df = df.withColumn('NumLinks', number_of_links('Body'))

In [10]:
assembler = VectorAssembler(inputCols = ['BodyLength', 'NumParagraphs', 'NumLinks'], outputCol = "NumFeatures")
df = assembler.transform(df)

In [11]:
scaler = Normalizer(inputCol = 'NumFeatures', outputCol ='ScaledNumFeatures' )
df = scaler.transform(df)

In [12]:
scaler2 = StandardScaler(inputCol = 'NumFeatures', outputCol ='ScaledNumFeatures2', withStd = True)
scalerModel = scaler2.fit(df)
df = scalerModel.transform(df)

In [13]:
cv = CountVectorizer(inputCol='words', outputCol='TF', vocabSize = 1000)
cvmodel = cv.fit(df)
df = cvmodel.transform(df)
df.take(2)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

In [14]:
idf = IDF(inputCol = 'TF', outputCol = 'TFIDF')
idfmodel = idf.fit(df)
df = idfmodel.transform(df)
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [15]:
pca = PCA(k = 100, inputCol = 'TFIDF', outputCol = 'pcaTFIDF')
model = pca.fit(df)
df = model.transform(df)

In [16]:
number_of_tags = udf(lambda x: len(x.split(" ")), IntegerType())
df = df.withColumn('NumTags', number_of_tags(df.Tags))

In [17]:
df.groupby('NumTags').count().orderBy('NumTags').show()

+-------+-----+
|NumTags|count|
+-------+-----+
|      1|13858|
|      2|26540|
|      3|28769|
|      4|19108|
|      5|11725|
+-------+-----+



In [18]:
# Below is the three different ways of doing the same thing. Written just for educational purposes.
df.groupBy('NumTags').agg({'BodyLength':'avg'}).orderBy('NumTags').show()

+-------+------------------+
|NumTags|   avg(BodyLength)|
+-------+------------------+
|      1|135.41311877615817|
|      2|153.82456669178598|
|      3|172.73704334526747|
|      4|192.67050450073268|
|      5|218.54251599147122|
+-------+------------------+



In [19]:
#df.groupBy('NumTags').agg(avg('BodyLength')).orderBy('NumTags').show()

In [20]:
#df.groupBy('NumTags').agg(avg(col('BodyLength'))).orderBy('NumTags').show()

In [21]:
assembler_1 = VectorAssembler(inputCols = ['BodyLength'], outputCol = "LengthFeature")
df = assembler_1.transform(df)

In [22]:
lr = LinearRegression(maxIter=5, regParam=0.0, fitIntercept=False, solver='normal')

In [23]:
data = df.select([col('NumTags').alias('label'), col('LengthFeature').alias('features')])

In [24]:
lrmodel = lr.fit(data)

In [25]:
lrmodel.coefficients

DenseVector([0.0079])

In [26]:
lrmodel.intercept

0.0

In [27]:
lrModelSummary = lrmodel.summary

In [28]:
lrModelSummary.r2

0.4248176257607954

In [29]:
df.persist()

DataFrame[Body: string, Id: bigint, Tags: string, Title: string, oneTag: string, words: array<string>, BodyLength: int, NumParagraphs: int, NumLinks: int, NumFeatures: vector, ScaledNumFeatures: vector, ScaledNumFeatures2: vector, TF: vector, TFIDF: vector, pcaTFIDF: vector, NumTags: int, LengthFeature: vector]

# Logistic Regression

In [32]:
indexer = StringIndexer(inputCol = 'oneTag', outputCol = 'label_tag')
df = indexer.fit(df).transform(df)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:53346)
Traceback (most recent call last):
  File "C:\spark-2.3.2-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\spark-2.3.2-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:53346)

In [61]:
data2 = df.select(col('label').alias('label'), col('TFIDF').alias('features'))
data2.head()

AnalysisException: "cannot resolve '`label`' given input columns: [Id, TFIDF, BodyLength, TF, LengthFeature, ScaledNumFeatures2, Title, NumLinks, NumParagraphs, NumTags, oneTag, ScaledNumFeatures, Tags, words, Body, NumFeatures, pcaTFIDF];;\n'Project ['label AS label#1543, TFIDF#1016 AS features#1544]\n+- Project [Body#777, Id#778L, Tags#779, Title#780, oneTag#781, words#787, BodyLength#820, NumParagraphs#828, NumLinks#837, NumFeatures#850, ScaledNumFeatures#861, ScaledNumFeatures2#903, TF#943, TFIDF#1016, pcaTFIDF#1090, NumTags#1106, UDF(named_struct(BodyLength_double_VectorAssembler_441bad295676a6e50cda, cast(BodyLength#820 as double))) AS LengthFeature#1241]\n   +- Project [Body#777, Id#778L, Tags#779, Title#780, oneTag#781, words#787, BodyLength#820, NumParagraphs#828, NumLinks#837, NumFeatures#850, ScaledNumFeatures#861, ScaledNumFeatures2#903, TF#943, TFIDF#1016, pcaTFIDF#1090, <lambda>(Tags#779) AS NumTags#1106]\n      +- Project [Body#777, Id#778L, Tags#779, Title#780, oneTag#781, words#787, BodyLength#820, NumParagraphs#828, NumLinks#837, NumFeatures#850, ScaledNumFeatures#861, ScaledNumFeatures2#903, TF#943, TFIDF#1016, UDF(TFIDF#1016) AS pcaTFIDF#1090]\n         +- Project [Body#777, Id#778L, Tags#779, Title#780, oneTag#781, words#787, BodyLength#820, NumParagraphs#828, NumLinks#837, NumFeatures#850, ScaledNumFeatures#861, ScaledNumFeatures2#903, TF#943, UDF(TF#943) AS TFIDF#1016]\n            +- Project [Body#777, Id#778L, Tags#779, Title#780, oneTag#781, words#787, BodyLength#820, NumParagraphs#828, NumLinks#837, NumFeatures#850, ScaledNumFeatures#861, ScaledNumFeatures2#903, UDF(words#787) AS TF#943]\n               +- Project [Body#777, Id#778L, Tags#779, Title#780, oneTag#781, words#787, BodyLength#820, NumParagraphs#828, NumLinks#837, NumFeatures#850, ScaledNumFeatures#861, UDF(NumFeatures#850) AS ScaledNumFeatures2#903]\n                  +- Project [Body#777, Id#778L, Tags#779, Title#780, oneTag#781, words#787, BodyLength#820, NumParagraphs#828, NumLinks#837, NumFeatures#850, UDF(NumFeatures#850) AS ScaledNumFeatures#861]\n                     +- Project [Body#777, Id#778L, Tags#779, Title#780, oneTag#781, words#787, BodyLength#820, NumParagraphs#828, NumLinks#837, UDF(named_struct(BodyLength_double_VectorAssembler_43f38441b20984c56b5b, cast(BodyLength#820 as double), NumParagraphs_double_VectorAssembler_43f38441b20984c56b5b, cast(NumParagraphs#828 as double), NumLinks_double_VectorAssembler_43f38441b20984c56b5b, cast(NumLinks#837 as double))) AS NumFeatures#850]\n                        +- Project [Body#777, Id#778L, Tags#779, Title#780, oneTag#781, words#787, BodyLength#820, NumParagraphs#828, <lambda>(Body#777) AS NumLinks#837]\n                           +- Project [Body#777, Id#778L, Tags#779, Title#780, oneTag#781, words#787, BodyLength#820, <lambda>(Body#777) AS NumParagraphs#828]\n                              +- Project [Body#777, Id#778L, Tags#779, Title#780, oneTag#781, words#787, <lambda>(words#787) AS BodyLength#820]\n                                 +- Project [Body#777, Id#778L, Tags#779, Title#780, oneTag#781, UDF(Body#777) AS words#787]\n                                    +- Relation[Body#777,Id#778L,Tags#779,Title#780,oneTag#781] json\n"