<a href="https://colab.research.google.com/github/jad3g/Analytics-suicide-rate/blob/Data-Analysis/Big_Analytics_For_Suicide_Rate_Overview_Rate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Refresh the apt package index, then install the headless OpenJDK 8 package.
# The install step runs with -qq and its stdout is discarded.
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
# List the JVM directories under /usr/lib/jvm; the JAVA_HOME value set further down
# must name one of the entries shown here.
!ls /usr/lib/jvm



In [None]:
!wget -q https://archive.apache.org/dist/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz


In [None]:
# List the current working directory (the Spark archive should appear here).
!ls

In [None]:
# Report the on-disk size of the downloaded Spark archive in human-readable units.
!du -sh spark-3.0.3-bin-hadoop3.2.tgz


In [None]:
!tar -xvzf spark-3.0.3-bin-hadoop3.2.tgz

In [None]:
import os

# Point the process environment at the installed JDK and the unpacked Spark distribution.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.0.3-bin-hadoop3.2",
    }
)

In [None]:
!pip install pyspark==3.0.3
!pip install -q findspark
import findspark
findspark.init()
findspark.find()

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import *
from pyspark.sql.functions import col, sum
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.clustering import KMeans, KMeansSummary
from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansSummary
from pyspark.ml.evaluation import ClusteringEvaluator
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Create (or reuse) the Spark session for this notebook and display it.
spark = (
    SparkSession.builder
    .appName('SuicideRatesOverview')
    .getOrCreate()
)
spark

In [None]:
# Size of the key set of the JVM SparkContext's executor-memory-status map,
# reached through the private `_jsc` handle.
# NOTE(review): the name `cores` suggests a CPU-core count, but the code counts entries
# in that map — confirm which quantity is actually intended.
cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
cores

**Reading the Dataset**

In [None]:
# First pass over the CSV: treat the first line as a header and let Spark infer column types.
csv_reader = spark.read.options(header=True, inferSchema=True)
data = csv_reader.csv('/content/master.csv')
data

In [None]:
# Preview the first rows of the loaded DataFrame as a text table.
data.show()

In [None]:
# Render a small sample as a pandas table.
# limit() is applied before toPandas() so only the previewed rows are collected to the
# driver, rather than the whole dataset.
data.limit(10).toPandas()

In [None]:
# Data validation: list the column names of the loaded DataFrame.
data.columns

In [None]:
# Display the first record of the DataFrame.
data.head()

In [None]:
# Print the column names and the types inferred during the read.
data.printSchema()

In [None]:
# Overview of the inferred-schema load: schema, column names, then summary statistics.
# printSchema() writes its own output and returns None, so it is called directly
# rather than wrapped in print(), which would add a stray "None" line.
data.printSchema()
print("")
print(data.columns)
print("")
# describe() only builds a DataFrame; show() is what actually prints the statistics.
data.describe().show()

In [None]:
from pyspark.sql.types import StructField,StringType,IntegerType,StructType,DoubleType


In [None]:
from struct import Struct
from pickle import TRUE
# Explicit column layout for the CSV: (column name, Spark type) pairs in file order.
# Every field is declared nullable.
_column_specs = [
    ("country", StringType()),
    ("year", IntegerType()),
    ("sex", StringType()),
    ("age", StringType()),
    ("suicide_no", IntegerType()),
    ("population", IntegerType()),
    ("suicides/100k pop", DoubleType()),
    ("country-year", StringType()),
    ("HDI for year", DoubleType()),
    ("gdp_for_year ($)", StringType()),
    ("gdp_per_capita ($)", IntegerType()),
    ("generation", StringType()),
]
data_schema = [
    StructField(field_name, field_type, True)
    for field_name, field_type in _column_specs
]

In [None]:
# Wrap the field list in a StructType so it can be used as an explicit read schema.
final_struc = StructType(data_schema)
final_struc

In [None]:
# Re-read the CSV with the explicit schema instead of inferred types.
# header=True is required here as well: the file starts with a header line (the earlier
# inferred-schema read relies on it), and without this option that line would be parsed
# as a data record, adding a bogus row to every count and aggregate below.
data = spark.read.csv('/content/master.csv', schema=final_struc, header=True)
data

In [None]:
# Print the schema of the current `data` DataFrame to check its column names and types.
data.printSchema()

In [None]:
# Report the dataset dimensions: row count (a Spark job) and number of columns.
row_count = data.count()
column_count = len(data.columns)
print('Number of rows:', row_count)
print('Number of columns:', column_count)

In [None]:
# List the column names of the current `data` DataFrame.
data.columns

In [None]:
# Print summary statistics for the columns of `data`.
data.describe().show()

In [None]:
# Missing-value audit: for every column, count the rows whose value is NaN or null.
from pyspark.sql.functions import col, isnan, when, count

missing_exprs = []
for column_name in data.columns:
    # A value counts as missing when it is NaN or null.
    is_missing = isnan(column_name) | col(column_name).isNull()
    missing_exprs.append(count(when(is_missing, column_name)).alias(column_name))

null_counts = data.select(missing_exprs)

# One row, one count per column.
null_counts.show()

In [None]:
# Replace missing values with 0 through the DataFrame's `na` accessor, then preview the result.
data_fill = data.na.fill(0)
data_fill.show()

In [None]:
# Print summary statistics again, this time for the zero-filled DataFrame.
data_fill.describe().show()

In [None]:
# Number of records per country.
data_fill.groupBy("country").count().show()

In [None]:
# Average of suicide_no for each country.
by_country = data_fill.groupBy("country")
by_country.avg("suicide_no").show()

In [None]:
# Count, extremes and quartiles for the main numeric columns.
summary_columns = ["suicide_no", "suicides/100k pop", "HDI for year", "gdp_per_capita ($)"]
summary_stats = ("count", "min", "25%", "50%", "75%", "max")
data_fill.select(*summary_columns).summary(*summary_stats).show()

In [None]:
# Encode the string-typed columns as numeric indices with StringIndexer.
# NOTE(review): "gdp_for_year ($)" and "country-year" are indexed as categories here —
# confirm that this is the intended encoding for them.
cat_cols = ["country", "sex", "age","gdp_for_year ($)", "country-year", "generation"]

# Fit one indexer per categorical column on the zero-filled data.
indexers = []
for cat_col in cat_cols:
    string_indexer = StringIndexer(inputCol=cat_col, outputCol=cat_col+"_index")
    indexers.append(string_indexer.fit(data_fill))

# Apply the fitted indexers one after another; each adds its "<column>_index" column.
indexed_data = data_fill
for indexer in indexers:
    indexed_data = indexer.transform(indexed_data)

indexed_data.show()


In [None]:
# Drop the original string columns; the remaining columns form the modelling table.
string_columns = ("country", "sex", "age", "country-year", "gdp_for_year ($)", "generation")
data_ML = indexed_data.drop(*string_columns)
data_ML.show()

**Applying PySpark ML clustering (KMeans and BisectingKMeans) to the suicide-rate overview data**

In [None]:
# Every column of the modelling table is used as a model input.
input_columns = data_ML.columns

# Pack the input columns into a single "features" vector column.
vecAssembler = VectorAssembler().setInputCols(input_columns).setOutputCol("features")
data_ML_KMeans = vecAssembler.transform(data_ML)
data_ML_KMeans.show()


In [None]:
# Elbow search for KMeans: fit one model for each k in [2, kmax) and record its training cost.
kmax = 50
# kmcost[k] holds the cost for k clusters; slots 0 and 1 are never filled.
kmcost = np.zeros(kmax)
for k in range(2, kmax):
    kmeans = KMeans(featuresCol="features", k=k, seed=1)
    model = kmeans.fit(data_ML_KMeans)
    # Training cost: sum of squared distances from the points to their cluster centres.
    kmcost[k] = model.summary.trainingCost

print(kmcost[2:])



In [None]:
# Elbow chart: KMeans training cost against the number of clusters.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(range(2, kmax), kmcost[2:kmax])
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Cost")
ax.set_title("Elbow Method for Optimal k")
plt.show()

In [None]:
## Final KMeans model: fit with the chosen k, assign clusters, and score them.
k = 8
kmeans = KMeans(featuresCol="features", k=k, seed=3)
model = kmeans.fit(data_ML_KMeans)

# Adds the cluster assignment to each row.
predictions = model.transform(data_ML_KMeans)

# Score the clustering with the evaluator's default settings.
evaluator = ClusteringEvaluator()
silhouette_score = evaluator.evaluate(predictions)
print("Silhouette Score = " + str(silhouette_score))

In [None]:
# Print each cluster centre of the fitted model on its own line.
# The loop variable has its own name so that `centers` still refers to the full list
# of centres after the loop, instead of being overwritten by the last element.
centers = model.clusterCenters()
for center in centers:
    print(center)

In [None]:
# Render a small sample of the clustered rows as a pandas table.
# limit() runs before toPandas() so only the previewed rows are collected to the driver,
# not the entire predictions DataFrame.
predictions.limit(10).toPandas()


In [None]:
# Smallest and largest suicide_no within each cluster.
# The Spark aggregates are referenced through an explicit module alias so this cell does not
# depend on a star import having replaced Python's built-in min/max.
from pyspark.sql import functions as F

predictions.groupBy("prediction").agg(F.min("suicide_no"), F.max("suicide_no")).show()

In [None]:
## BisectingKMeans elbow search: fit one model for each k in [2, kmax) and record its training cost.
kmax = 50
# bkmcost[k] holds the cost for k clusters; slots 0 and 1 are never filled.
bkmcost = np.zeros(kmax)
for k in range(2, kmax):
    bkmeans = BisectingKMeans(featuresCol="features", k=k, seed=1)
    model_bk = bkmeans.fit(data_ML_KMeans)
    bkmcost[k] = model_bk.summary.trainingCost

print(bkmcost[2:])



In [None]:
# Elbow chart: BisectingKMeans training cost against the number of clusters.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(range(2, kmax), bkmcost[2:kmax])
ax.set(
    xlabel="Number of Clusters (k)",
    ylabel="Cost",
    title="Elbow Method for Optimal k",
)
plt.show()

In [None]:
# Final BisectingKMeans model: fit with the chosen k, assign clusters, and score them.
# This rebinds `model`, `predictions` and `evaluator` from the KMeans section.
k = 8
bkmeans = BisectingKMeans(featuresCol="features", k=k, seed=1)
model = bkmeans.fit(data_ML_KMeans)

# Adds the cluster assignment to each row.
predictions = model.transform(data_ML_KMeans)

# Score the clustering with the evaluator's default settings.
evaluator = ClusteringEvaluator()
silhouette_bkmeans_score = evaluator.evaluate(predictions)
print("Silhouette_bkmeans_score = " + str(silhouette_bkmeans_score))

In [None]:
# Smallest and largest suicide_no within each cluster of the current predictions.
# The Spark aggregates are referenced through an explicit module alias so this cell does not
# depend on a star import having replaced Python's built-in min/max.
from pyspark.sql import functions as F

predictions.groupBy("prediction").agg(F.min("suicide_no"), F.max("suicide_no")).show()