In [54]:
import os

import findspark

# Make the Spark installation importable from this kernel.
# An explicit SPARK_HOME environment variable takes precedence, so the notebook
# is not tied to one machine; when it is unset, fall back to the original
# Windows install location so existing setups keep working unchanged.
findspark.init(os.environ.get('SPARK_HOME', 'C:/spark-2.3.2-bin-hadoop2.7'))

In [73]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, avg, col, concat, desc, explode, lit, min, max, split, stddev_pop, stddev_samp, stddev, count
from pyspark.sql.types import IntegerType

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer, IDF, Normalizer, PCA, RegexTokenizer, StandardScaler, StopWordsRemover, VectorAssembler, StringIndexer, MinMaxScaler, PCA
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.clustering import KMeans

import re

# Folder holding the input data. The trailing slash matters: the JSON path is
# built by plain string concatenation (data_dir + file name) when the data is read.
data_dir = 'C:/Users/John/PycharmProjects/customer-attrition/data/'

In [56]:
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("spark machine learning")
    .getOrCreate()
)

In [57]:
# Read the training questions from JSON and mark the DataFrame for caching.
# The bare `df.persist()` stays last so the cell still displays the schema.
train_path = data_dir + 'Train_onetag_small.json'
df = spark.read.json(train_path)
df.persist()

DataFrame[Body: string, Id: bigint, Tags: string, Title: string, oneTag: string]

In [58]:
# Join title and body into a single text column, separated by one space.
title_and_body = concat(df["Title"], lit(' '), df["Body"])
df = df.withColumn("Desc", title_and_body)

In [59]:
# Split the combined text on non-word characters; the tokens land in "words".
regexTokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="Desc",
    outputCol="words",
)
df = regexTokenizer.transform(df)

In [60]:
# Description length = number of tokens in the "words" array of each row.
body_length = udf(lambda tokens: len(tokens), IntegerType())
df = df.withColumn("DescLength", body_length(df["words"]))

In [61]:
# Wrap the scalar length in a one-element feature vector ("DescVec").
assembler = VectorAssembler(outputCol="DescVec", inputCols=["DescLength"])
df = assembler.transform(df)

In [62]:
# Count the tags attached to each question. `Tags` is a single string of
# space-separated tag names, so the count is the number of whitespace-separated tokens.
# str.split() with no argument ignores leading, trailing and repeated whitespace,
# so an empty string yields 0 tags (split(' ') reported 1) and stray spaces do not
# inflate the count. A missing (null) value also yields 0 instead of raising
# AttributeError inside the UDF.
number_of_tags = udf(lambda x: len(x.split()) if x else 0, IntegerType())
df = df.withColumn("NumTags", number_of_tags(df.Tags))

In [63]:
# Shortest description, in tokens.
shortest_desc = df.agg(min(col('DescLength')))
shortest_desc.show()

+---------------+
|min(DescLength)|
+---------------+
|             10|
+---------------+



In [64]:
# Longest description, in tokens.
longest_desc = df.agg(max(col('DescLength')))
longest_desc.show()

+---------------+
|max(DescLength)|
+---------------+
|           7532|
+---------------+



In [65]:
# Mean and standard deviation of the description length.
desc_length_stats = df.agg(
    avg(col('DescLength')),
    stddev(col('DescLength')),
)
desc_length_stats.show()

+---------------+-----------------------+
|avg(DescLength)|stddev_samp(DescLength)|
+---------------+-----------------------+
|      180.28187|     192.10819533505136|
+---------------+-----------------------+



In [67]:
# Cluster the questions into 5 groups by description length (seed fixed at 42)
# and attach each row's cluster id as "DescGroup".
kmeans = KMeans(featuresCol='DescVec', predictionCol='DescGroup', k=5, seed=42)
model = kmeans.fit(df)
df = model.transform(df)

In [68]:
# Show the first row, including the columns added so far.
df.first()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', Desc="How to check if an uploaded file is an image without mime type? <p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an imag

In [74]:
# Per-cluster summary: mean description length, mean tag count and cluster size,
# listed from the cluster with the shortest descriptions to the longest.
cluster_summary = df.groupBy('DescGroup').agg(
    avg('DescLength'),
    avg('NumTags'),
    count('DescLength'),
)
cluster_summary.orderBy("avg(DescLength)").show()

+---------+------------------+------------------+-----------------+
|DescGroup|   avg(DescLength)|      avg(NumTags)|count(DescLength)|
+---------+------------------+------------------+-----------------+
|        0|102.79746853837882| 2.764364809486442|            68814|
|        3|267.89725946604807|3.1206699956749104|            25433|
|        2| 581.5742473555737|3.2394222945484135|             4916|
|        4| 1282.741620111732| 3.275139664804469|              716|
|        1| 3003.214876033058|3.6115702479338845|              121|
+---------+------------------+------------------+-----------------+



In [8]:
# UDF returning the number of elements in the value it is given (a token list here).
token_length = udf(lambda tokens: len(tokens), IntegerType())

In [9]:
# Token count of the combined Title + Body text.
# The tokens live in the "words" column written by the RegexTokenizer cell; the
# previously referenced 'Title_Body_tokens' column is never created in this
# notebook, so the lookup failed on a clean top-to-bottom run.
df = df.withColumn("Title_Body_Tk_Length", token_length('words'))

In [10]:
# Print the first row to inspect the newly added column.
first_row = df.head()
print(first_row)

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', Title_Body_Joined="How to check if an uploaded file is an image without mime type? <p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded fi

# What is a Description Length?


I'll refer to the length of the combined Title and Body fields as the Description Length
(by length we mean the number of words when the text is tokenized with pattern="\W").

In [25]:
# How many times greater is the Description Length of the longest question than the Description Length
# of the shortest question (rounded to the nearest whole number)?

# Fetch both extremes in a single aggregation so Spark scans the data once
# instead of launching one job for the max and another for the min.
# Note: `max` / `min` here are the pyspark.sql.functions aggregates imported at
# the top of the notebook, not the Python builtins.
length_extremes = df.agg(
    max('Title_Body_Tk_Length').alias('longest'),
    min('Title_Body_Tk_Length').alias('shortest'),
).first()

# True division keeps the exact ratio; round(length_ratio) gives the
# whole-number answer asked for above.
length_ratio = length_extremes['longest'] / length_extremes['shortest']
print(length_ratio)

753.2


In [51]:
# Shortest Description Length, in tokens.
shortest_length = df.agg(min(col('Title_Body_Tk_Length')))
shortest_length.show()

+-------------------------+
|min(Title_Body_Tk_Length)|
+-------------------------+
|                       10|
+-------------------------+



In [52]:
# Longest Description Length, in tokens.
longest_length = df.agg(max(col('Title_Body_Tk_Length')))
longest_length.show()

+-------------------------+
|max(Title_Body_Tk_Length)|
+-------------------------+
|                     7532|
+-------------------------+



In [31]:
# Mean of the Description Length (the standard deviation is computed in later cells).
mean_row = df.agg({'Title_Body_Tk_Length': 'avg'}).first()
average = mean_row['avg(Title_Body_Tk_Length)']
print(average)

180.28187


In [53]:
# Mean and standard deviation of the Description Length, side by side.
length_stats = df.agg(
    avg(col('Title_Body_Tk_Length')),
    stddev(col('Title_Body_Tk_Length')),
)
length_stats.show()

+-------------------------+---------------------------------+
|avg(Title_Body_Tk_Length)|stddev_samp(Title_Body_Tk_Length)|
+-------------------------+---------------------------------+
|                180.28187|               192.10819533505136|
+-------------------------+---------------------------------+



In [32]:
# Standard deviation of the Description Length using the 'stddev' aggregate.
stddev_row = df.agg({'Title_Body_Tk_Length': 'stddev'}).first()
std_dev = stddev_row['stddev(Title_Body_Tk_Length)']
print(std_dev)

192.10819533505136


In [33]:
# Population standard deviation of the Description Length ('stddev_pop').
stddev_pop_row = df.agg({'Title_Body_Tk_Length': 'stddev_pop'}).first()
std_pop = stddev_pop_row['stddev_pop(Title_Body_Tk_Length)']
print(std_pop)

192.10723479167333


In [34]:
# Sample standard deviation of the Description Length ('stddev_samp').
stddev_samp_row = df.agg({'Title_Body_Tk_Length': 'stddev_samp'}).first()
std_samp = stddev_samp_row['stddev_samp(Title_Body_Tk_Length)']
print(std_samp)

192.10819533505136


In [37]:
# Wrap the token count in a one-element vector named "features",
# the default input column name used by the KMeans estimator below.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['Title_Body_Tk_Length'],
)
df = assembler.transform(df)

In [38]:
# Print the first row to inspect the assembled "features" column.
first_row = df.head()
print(first_row)

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', Title_Body_Joined="How to check if an uploaded file is an image without mime type? <p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded fi

In [36]:
# K-means estimator with 5 clusters and the seed fixed at 42.
kmeans = KMeans(k=5, seed=42)

In [39]:
# Keep only the feature vector column for clustering.
dataset = df.select('features')

In [40]:
# Fit k-means on the feature-only DataFrame.
model = kmeans.fit(dataset=dataset)

In [41]:
# Assign every row to its nearest cluster.
predictions = model.transform(dataset=dataset)

In [44]:
# List the fitted cluster centres, one per line, under a header.
centers = model.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

Cluster Centers: 
[103.45126401]
[3045.86086957]
[592.30449606]
[271.27832757]
[1306.64680233]
