In [2]:
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('basics').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/04 10:10:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/10/04 10:10:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/10/04 10:10:46 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
df = spark.read.csv('dataset1-1/part-00000-08c7db4a-479f-4c60-9ae4-4ae109f8ec54-c000.csv', header=True)
df.show()



+-----------+------+---------------------+---------------------+--------------------+-----------+------------------+
|    Country|Gender|Demographics Question|Demographics Response|            Question|Survey Year|             Value|
+-----------+------+---------------------+---------------------+--------------------+-----------+------------------+
|Afghanistan|     F|            Education|               Higher|... if she burns ...| 01/01/2015|3.1780497764304645|
|Afghanistan|     F|            Education|            Secondary|... if she burns ...| 01/01/2015|3.7013510788987767|
|Afghanistan|     F|            Education|              Primary|... if she burns ...| 01/01/2015|3.7148351498733914|
|Afghanistan|     F|       Marital status| Widowed, divorced...|... if she burns ...| 01/01/2015|3.7148351498733914|
|Afghanistan|     F|           Employment|    Employed for kind|... if she burns ...| 01/01/2015| 4.123105625617661|
|Afghanistan|     F|                  Age|                15-24|

In [9]:
!pip install scikit-learn
from pyspark.sql.functions import col
import seaborn as sns
from pyspark.sql.types import FloatType,IntegerType
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

df = df.withColumn("Value", col("Value").cast(FloatType()))
df1 = df.toPandas()

X_Country = pd.get_dummies(df1['Country'], drop_first=True, dtype=int)
X_Gender = pd.get_dummies(df1['Gender'], drop_first=True, dtype=int)
X_dq = pd.get_dummies(df1['Demographics Question'], drop_first=True, dtype=int)
X_dr = pd.get_dummies(df1['Demographics Response'], drop_first=True, dtype=int)
X_Question = pd.get_dummies(df1['Question'], drop_first=True, dtype=int)
X_sy = pd.get_dummies(df1['Survey Year'], drop_first=True, dtype=int)

df_encoded = pd.concat([df1, X_Country, X_Gender,X_dq,X_dr,X_Question,X_sy], axis=1)

df_encoded.drop(['Country', 'Gender','Demographics Question','Demographics Response','Question','Survey Year'], axis=1, inplace=True)

y = df_encoded['Value']

X = df_encoded.drop('Value', axis=1)
X = sm.add_constant(X)  


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# MLPRegressor
nn_model = MLPRegressor(hidden_layer_sizes=(100, 50), random_state=42)
nn_model.fit(X_train, y_train)
nn_predictions = nn_model.predict(X_test)

# LinearRegression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)


rf_rmse = mean_squared_error(y_test, rf_predictions, squared=False)
nn_rmse = mean_squared_error(y_test, nn_predictions, squared=False)
lr_rmse = mean_squared_error(y_test, lr_predictions, squared=False)


print(f"RandomForestRegressor RMSE: {rf_rmse}")
print(f"MLPRegressor RMSE: {nn_rmse}")
print(f"LinearRegression RMSE: {lr_rmse}")


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting joblib>=1.1.1
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 KB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.3.1 threadpoolctl-3.2.0
RandomForestRegressor RMSE: 0.5735223343369803
MLPRegressor RMSE: 0.2598672509816248
LinearRegression RMSE: 0.8082320470140498


In [3]:
df2 = spark.read.csv('dataset4/part-00000-e8fbff2b-4af5-4659-8ee9-feb16b3e44b4-c000.csv', header=True)
df2.describe().show()

df3 = spark.read.csv('dataset3/part-00000-97be97e5-9192-457c-86ac-38f5a8a3b2aa-c000.csv', header=True)
df3.describe().show()

                                                                                

+-------+-----------+------+------------------+-------------------+--------------------+
|summary|FECHA HECHO|GENERO|          CANTIDAD|    FECHA HECHO_STR|         Weapon Used|
+-------+-----------+------+------------------+-------------------+--------------------+
|  count|     466679|466679|            466679|             466679|              466679|
|   mean|       null|  null|1.5416313997415783|               null|                null|
| stddev|       null|  null|1.6656822733680472|               null|                null|
|    min| 2015-01-01|     F|                 1|2015-01-01 00:00:00|No Weapon or Subs...|
|    max| 2023-02-28|     M|                 9|2023-02-28 00:00:00|Using weapons and...|
+-------+-----------+------+------------------+-------------------+--------------------+



[Stage 5:>                                                          (0 + 1) / 1]

+-------+-----------+------------------+------+--------------------+-----------------------+
|summary|FECHA HECHO|          CANTIDAD|GENERO|         Weapon Used|Total domestic violence|
+-------+-----------+------------------+------+--------------------+-----------------------+
|  count|      56832|             56832| 56832|               56832|                  56832|
|   mean|       null|3.9417053772522523|  null|                null|      8.211553350225225|
| stddev|       null|3.6173151565087447|  null|                null|     15.789425843555712|
|    min| 2015-01-01|                 1|     F|No Weapon or Subs...|                    1.0|
|    max| 2023-02-28|                 9|     M|Using weapons and...|                   99.0|
+-------+-----------+------------------+------+--------------------+-----------------------+



                                                                                

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col

df3 = df3.withColumn("CANTIDAD", col("CANTIDAD").cast(IntegerType()))
data=df3

gender_indexer = StringIndexer(inputCol="GENERO", outputCol="GENEROIndex")
gender_encoder = OneHotEncoder(inputCol="GENEROIndex", outputCol="GENEROVec")
wp_indexer = StringIndexer(inputCol="Weapon Used", outputCol="WeaponUsedIndex")
wp_encoder = OneHotEncoder(inputCol="WeaponUsedIndex", outputCol="WeaponUsedVec")

preprocessing_stages = [gender_indexer, gender_encoder, wp_indexer, wp_encoder]
preprocessing_pipeline = Pipeline(stages=preprocessing_stages)

preprocessing_model = preprocessing_pipeline.fit(data)
preprocessed_data = preprocessing_model.transform(data)

feature_cols = ["GENEROVec", "CANTIDAD", "WeaponUsedVec"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled_data = assembler.transform(preprocessed_data)


# create K-Means model
kmeans = KMeans(k=4,featuresCol="features", predictionCol="cluster")
kmeans_model = kmeans.fit(assembled_data)
clustered_data = kmeans_model.transform(assembled_data)

#silhouette_score
from pyspark.ml.evaluation import ClusteringEvaluator
evaluator = ClusteringEvaluator(predictionCol="cluster", featuresCol="features")
silhouette_score = evaluator.evaluate(clustered_data)
print("Silhouette Score:", silhouette_score)

[Stage 124:>                                                        (0 + 1) / 1]                                                                                

Silhouette Score: 0.43348958768511714


In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col

df2 = df2.withColumn("CANTIDAD", col("CANTIDAD").cast(IntegerType()))
data=df2

gender_indexer = StringIndexer(inputCol="GENERO", outputCol="GENEROIndex")
gender_encoder = OneHotEncoder(inputCol="GENEROIndex", outputCol="GENEROVec")
wp_indexer = StringIndexer(inputCol="Weapon Used", outputCol="WeaponUsedIndex")
wp_encoder = OneHotEncoder(inputCol="WeaponUsedIndex", outputCol="WeaponUsedVec")

preprocessing_stages = [gender_indexer, gender_encoder, wp_indexer, wp_encoder]
preprocessing_pipeline = Pipeline(stages=preprocessing_stages)

preprocessing_model = preprocessing_pipeline.fit(data)
preprocessed_data = preprocessing_model.transform(data)

feature_cols = ["GENEROVec", "CANTIDAD", "WeaponUsedVec"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled_data = assembler.transform(preprocessed_data)


# create K-Means model
kmeans = KMeans(k=4,featuresCol="features", predictionCol="cluster")
kmeans_model = kmeans.fit(assembled_data)
clustered_data = kmeans_model.transform(assembled_data)

#silhouette_score
from pyspark.ml.evaluation import ClusteringEvaluator
evaluator = ClusteringEvaluator(predictionCol="cluster", featuresCol="features")
silhouette_score = evaluator.evaluate(clustered_data)
print("Silhouette Score:", silhouette_score)




Silhouette Score: 0.5894037254447393


                                                                                