In [1]:
from pyspark.sql.functions import isnull,sum

In [2]:
# Importing pyspark and starting session to use sprak functionality 
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,when,isnan,count
from pyspark.sql.types import IntegerType, StructType,StructField, FloatType , DoubleType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler,StandardScaler
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import regexp_replace
from pyspark.ml.regression import LinearRegression 
spark = SparkSession.builder.appName("suicide").getOrCreate()


In [3]:
# reading csv file
dataset = spark.read.csv('suicide.csv',inferSchema=True,header=True) # infer schema automatocally guess column datatypes
dataset.show(5)

+-------+----+------+-----------+-----------+----------+-----------------+------------+------------+------------------+------------------+---------------+
|country|year|   sex|        age|suicides_no|population|suicides/100k pop|country-year|HDI for year| gdp_for_year ($) |gdp_per_capita ($)|     generation|
+-------+----+------+-----------+-----------+----------+-----------------+------------+------------+------------------+------------------+---------------+
|Albania|1987|  male|15-24 years|         21|    312900|             6.71| Albania1987|        null|     2,156,624,900|               796|   Generation X|
|Albania|1987|  male|35-54 years|         16|    308000|             5.19| Albania1987|        null|     2,156,624,900|               796|         Silent|
|Albania|1987|female|15-24 years|         14|    289700|             4.83| Albania1987|        null|     2,156,624,900|               796|   Generation X|
|Albania|1987|  male|  75+ years|          1|     21800|             4

In [4]:
dataset.printSchema()

root
 |-- country: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: string (nullable = true)
 |-- suicides_no: integer (nullable = true)
 |-- population: integer (nullable = true)
 |-- suicides/100k pop: double (nullable = true)
 |-- country-year: string (nullable = true)
 |-- HDI for year: double (nullable = true)
 |--  gdp_for_year ($) : string (nullable = true)
 |-- gdp_per_capita ($): integer (nullable = true)
 |-- generation: string (nullable = true)



In [5]:
dataset.count()

27820

In [6]:
null_count = dataset.select([sum(isnull(c).cast("int")).alias(c) for c in dataset.columns])

# print the count of null values in each column
null_count.show()

+-------+----+---+---+-----------+----------+-----------------+------------+------------+------------------+------------------+----------+
|country|year|sex|age|suicides_no|population|suicides/100k pop|country-year|HDI for year| gdp_for_year ($) |gdp_per_capita ($)|generation|
+-------+----+---+---+-----------+----------+-----------------+------------+------------+------------------+------------------+----------+
|      0|   0|  0|  0|          0|         0|                0|           0|       19456|                 0|                 0|         0|
+-------+----+---+---+-----------+----------+-----------------+------------+------------+------------------+------------------+----------+



In [7]:
df = dataset.drop("HDI for year", "country","year")
df.show()

+------+-----------+-----------+----------+-----------------+------------+------------------+------------------+---------------+
|   sex|        age|suicides_no|population|suicides/100k pop|country-year| gdp_for_year ($) |gdp_per_capita ($)|     generation|
+------+-----------+-----------+----------+-----------------+------------+------------------+------------------+---------------+
|  male|15-24 years|         21|    312900|             6.71| Albania1987|     2,156,624,900|               796|   Generation X|
|  male|35-54 years|         16|    308000|             5.19| Albania1987|     2,156,624,900|               796|         Silent|
|female|15-24 years|         14|    289700|             4.83| Albania1987|     2,156,624,900|               796|   Generation X|
|  male|  75+ years|          1|     21800|             4.59| Albania1987|     2,156,624,900|               796|G.I. Generation|
|  male|25-34 years|          9|    274300|             3.28| Albania1987|     2,156,624,900|    

In [8]:
df = df.withColumnRenamed("suicides/100k pop", "suicides_per_100kpop").withColumnRenamed("gdp_for_year ($)", "gdp_year").withColumnRenamed("gdp_per_capita ($)", "gdp_percapita")
df.show(5)

+------+-----------+-----------+----------+--------------------+------------+------------------+-------------+---------------+
|   sex|        age|suicides_no|population|suicides_per_100kpop|country-year| gdp_for_year ($) |gdp_percapita|     generation|
+------+-----------+-----------+----------+--------------------+------------+------------------+-------------+---------------+
|  male|15-24 years|         21|    312900|                6.71| Albania1987|     2,156,624,900|          796|   Generation X|
|  male|35-54 years|         16|    308000|                5.19| Albania1987|     2,156,624,900|          796|         Silent|
|female|15-24 years|         14|    289700|                4.83| Albania1987|     2,156,624,900|          796|   Generation X|
|  male|  75+ years|          1|     21800|                4.59| Albania1987|     2,156,624,900|          796|G.I. Generation|
|  male|25-34 years|          9|    274300|                3.28| Albania1987|     2,156,624,900|          796| 

In [9]:
df.printSchema()

root
 |-- sex: string (nullable = true)
 |-- age: string (nullable = true)
 |-- suicides_no: integer (nullable = true)
 |-- population: integer (nullable = true)
 |-- suicides_per_100kpop: double (nullable = true)
 |-- country-year: string (nullable = true)
 |--  gdp_for_year ($) : string (nullable = true)
 |-- gdp_percapita: integer (nullable = true)
 |-- generation: string (nullable = true)



In [10]:
column_names = df.columns

# print the column names
print(column_names)

['sex', 'age', 'suicides_no', 'population', 'suicides_per_100kpop', 'country-year', ' gdp_for_year ($) ', 'gdp_percapita', 'generation']


In [11]:
df.toPandas().to_excel('final.xlsx', sheet_name = 'Sheet1', index = False)

In [12]:
df.printSchema()

root
 |-- sex: string (nullable = true)
 |-- age: string (nullable = true)
 |-- suicides_no: integer (nullable = true)
 |-- population: integer (nullable = true)
 |-- suicides_per_100kpop: double (nullable = true)
 |-- country-year: string (nullable = true)
 |--  gdp_for_year ($) : string (nullable = true)
 |-- gdp_percapita: integer (nullable = true)
 |-- generation: string (nullable = true)



In [13]:
strinx = StringIndexer(inputCols = ["sex", "age", "country-year", "generation"," gdp_for_year ($) "],outputCols =["sex_Trans", "age_Trans", "country-year_Trans", "generation_Trans","gdp_for_year_Trans"] )

In [14]:
df = strinx.fit(df).transform(df)

In [15]:
df.show()

+------+-----------+-----------+----------+--------------------+------------+------------------+-------------+---------------+---------+---------+------------------+----------------+------------------+
|   sex|        age|suicides_no|population|suicides_per_100kpop|country-year| gdp_for_year ($) |gdp_percapita|     generation|sex_Trans|age_Trans|country-year_Trans|generation_Trans|gdp_for_year_Trans|
+------+-----------+-----------+----------+--------------------+------------+------------------+-------------+---------------+---------+---------+------------------+----------------+------------------+
|  male|15-24 years|         21|    312900|                6.71| Albania1987|     2,156,624,900|          796|   Generation X|      1.0|      0.0|               0.0|             0.0|             718.0|
|  male|35-54 years|         16|    308000|                5.19| Albania1987|     2,156,624,900|          796|         Silent|      1.0|      2.0|               0.0|             1.0|          

In [16]:
onenc = OneHotEncoder(inputCols = ["sex_Trans", "age_Trans", "country-year_Trans", "generation_Trans","gdp_for_year_Trans"],
                      outputCols =["sex_VEC", "age_VEC", "country-year_VEC", "generation_VEC","gdp_for_year_VEC"] )
df = onenc.fit(df).transform(df)

In [17]:
df.show(5,vertical=True)

-RECORD 0----------------------------------
 sex                  | male               
 age                  | 15-24 years        
 suicides_no          | 21                 
 population           | 312900             
 suicides_per_100kpop | 6.71               
 country-year         | Albania1987        
  gdp_for_year ($)    | 2,156,624,900      
 gdp_percapita        | 796                
 generation           | Generation X       
 sex_Trans            | 1.0                
 age_Trans            | 0.0                
 country-year_Trans   | 0.0                
 generation_Trans     | 0.0                
 gdp_for_year_Trans   | 718.0              
 sex_VEC              | (1,[],[])          
 age_VEC              | (5,[0],[1.0])      
 country-year_VEC     | (2320,[0],[1.0])   
 generation_VEC       | (5,[0],[1.0])      
 gdp_for_year_VEC     | (2320,[718],[1.0]) 
-RECORD 1----------------------------------
 sex                  | male               
 age                  | 35-54 ye

In [18]:
column_names = df.columns

# print the column names
print(column_names)

['sex', 'age', 'suicides_no', 'population', 'suicides_per_100kpop', 'country-year', ' gdp_for_year ($) ', 'gdp_percapita', 'generation', 'sex_Trans', 'age_Trans', 'country-year_Trans', 'generation_Trans', 'gdp_for_year_Trans', 'sex_VEC', 'age_VEC', 'country-year_VEC', 'generation_VEC', 'gdp_for_year_VEC']


In [19]:
df_dropped = df.drop("sex", "age", "country-year", "generation", " gdp_for_year ($) ")
df_dropped.show()

+-----------+----------+--------------------+-------------+---------+---------+------------------+----------------+------------------+-------------+-------------+----------------+--------------+------------------+
|suicides_no|population|suicides_per_100kpop|gdp_percapita|sex_Trans|age_Trans|country-year_Trans|generation_Trans|gdp_for_year_Trans|      sex_VEC|      age_VEC|country-year_VEC|generation_VEC|  gdp_for_year_VEC|
+-----------+----------+--------------------+-------------+---------+---------+------------------+----------------+------------------+-------------+-------------+----------------+--------------+------------------+
|         21|    312900|                6.71|          796|      1.0|      0.0|               0.0|             0.0|             718.0|    (1,[],[])|(5,[0],[1.0])|(2320,[0],[1.0])| (5,[0],[1.0])|(2320,[718],[1.0])|
|         16|    308000|                5.19|          796|      1.0|      2.0|               0.0|             1.0|             718.0|    (1,[],

In [20]:
new_column_names = df_dropped.columns


In [21]:
Va = VectorAssembler()
vA = Va.setParams(inputCols=new_column_names, outputCol='features')
df_dropped = vA.transform(df_dropped)
df.show(5,vertical=True)

-RECORD 0----------------------------------
 sex                  | male               
 age                  | 15-24 years        
 suicides_no          | 21                 
 population           | 312900             
 suicides_per_100kpop | 6.71               
 country-year         | Albania1987        
  gdp_for_year ($)    | 2,156,624,900      
 gdp_percapita        | 796                
 generation           | Generation X       
 sex_Trans            | 1.0                
 age_Trans            | 0.0                
 country-year_Trans   | 0.0                
 generation_Trans     | 0.0                
 gdp_for_year_Trans   | 718.0              
 sex_VEC              | (1,[],[])          
 age_VEC              | (5,[0],[1.0])      
 country-year_VEC     | (2320,[0],[1.0])   
 generation_VEC       | (5,[0],[1.0])      
 gdp_for_year_VEC     | (2320,[718],[1.0]) 
-RECORD 1----------------------------------
 sex                  | male               
 age                  | 35-54 ye

In [22]:
splt = df_dropped.randomSplit([0.7,0.3])
train_df= splt[0]
test_df = splt[1]

In [23]:
# linear Regression
from pyspark.ml.regression import LinearRegression
regre = LinearRegression(featuresCol = 'features', labelCol='suicides_no', maxIter=10)
Regremodel = regre.fit(train_df)
print("Intercept: " + str(Regremodel.intercept))

Intercept: -12.410825027607643


In [24]:
summary = Regremodel.summary
summary.rootMeanSquaredError


19.16323914950861

In [25]:
# Factorization machines regressor
from pyspark.ml import Pipeline
from pyspark.ml.regression import FMRegressor
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.evaluation import RegressionEvaluator

fm = FMRegressor(featuresCol="features",labelCol = 'suicides_no', stepSize=0.001)
fmmodel = fm.fit(train_df)

In [26]:
predictions = fmmodel.transform(test_df)

In [27]:
predictions.select('suicides_no','prediction', 'features').show(10)

+-----------+-------------------+--------------------+
|suicides_no|         prediction|            features|
+-----------+-------------------+--------------------+
|          0| -390.3924358107498|(4660,[1,3,4,5,6,...|
|          0| -408.0809664503007|(4660,[1,3,4,5,6,...|
|          0| -532.9126499032207|(4660,[1,3,4,5,6,...|
|          0|   -557.86813987421|(4660,[1,3,4,5,6,...|
|          0|-413.56270165549904|(4660,[1,3,5,6,7,...|
|          0|-416.63267667039577|(4660,[1,3,5,6,7,...|
|          0| -422.4667886608918|(4660,[1,3,5,6,7,...|
|          0|  816.6060537853714|(4660,[1,3,4,5,6,...|
|          0| -427.7892091728612|(4660,[1,3,5,6,7,...|
|          0|-479.57327604680836|(4660,[1,3,4,5,6,...|
+-----------+-------------------+--------------------+
only showing top 10 rows



In [28]:
evaluator = RegressionEvaluator(
    labelCol="suicides_no", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 130151
