In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("USGS").getOrCreate()

In [None]:
data=spark.read.format("csv").option("header","true").option("inferSchema","true").load("dbfs:/FileStore/shared_uploads/kishoresai.ganes@tigeranalytics.com/usgs_main.csv")

In [40]:
data.describe()

DataFrame[summary: string, latitude: string, longitude: string, depth: string, mag: string, magType: string, nst: string, gap: string, dmin: string, rms: string, net: string, id: string, place: string, type: string, horizontalError: string, depthError: string, magError: string, magNst: string, status: string, locationSource: string, magSource: string]


In [41]:
data.groupby("type").count().show()

+------------------+-----+
|              type|count|
+------------------+-----+
|         explosion|  376|
|         ice quake|   11|
|      quarry blast|  665|
|       other event|    5|
|        earthquake|74752|
|chemical explosion|    1|
+------------------+-----+


In [42]:
data.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- depth: double (nullable = true)
 |-- mag: double (nullable = true)
 |-- magType: string (nullable = true)
 |-- nst: double (nullable = true)
 |-- gap: double (nullable = true)
 |-- dmin: double (nullable = true)
 |-- rms: double (nullable = true)
 |-- net: string (nullable = true)
 |-- id: string (nullable = true)
 |-- updated: timestamp (nullable = true)
 |-- place: string (nullable = true)
 |-- type: string (nullable = true)
 |-- horizontalError: double (nullable = true)
 |-- depthError: double (nullable = true)
 |-- magError: double (nullable = true)
 |-- magNst: double (nullable = true)
 |-- status: string (nullable = true)
 |-- locationSource: string (nullable = true)
 |-- magSource: string (nullable = true)


In [43]:
data.show(5,False)

+-----------------------+----------+------------+-----+----+-------+----+-----+--------+----+---+------------+-----------------------+----------------------------------+----------+---------------+----------+--------+------+---------+--------------+---------+
|time                   |latitude  |longitude   |depth|mag |magType|nst |gap  |dmin    |rms |net|id          |updated                |place                             |type      |horizontalError|depthError|magError|magNst|status   |locationSource|magSource|
+-----------------------+----------+------------+-----+----+-------+----+-----+--------+----+---+------------+-----------------------+----------------------------------+----------+---------------+----------+--------+------+---------+--------------+---------+
|2022-03-04 21:28:02.44 |38.7596664|-122.7196655|1.61 |1.24|md     |14.0|115.0|0.004494|0.04|nc |nc73701241  |2022-03-04 21:29:36.906|3km SW of Anderson Springs, CA    |earthquake|0.3            |0.36      |0.1     |5.0   |

In [44]:
data=data.na.drop()

In [45]:
data=data.drop("place","time","magSource")

In [46]:
data=data.withColumnRenamed("updated","time")
data=data.withColumnRenamed("locationSource","source")

In [47]:
data.show()

+-----------------+-------------------+-----+----+-------+----+-----+--------+----+---+----------+--------------------+------------+---------------+----------+-----------------+------+---------+------+
|         latitude|          longitude|depth| mag|magType| nst|  gap|    dmin| rms|net|        id|                time|        type|horizontalError|depthError|         magError|magNst|   status|source|
+-----------------+-------------------+-----+----+-------+----+-----+--------+----+---+----------+--------------------+------------+---------------+----------+-----------------+------+---------+------+
|       38.7596664|       -122.7196655| 1.61|1.24|     md|14.0|115.0|0.004494|0.04| nc|nc73701241|2022-03-04 21:29:...|  earthquake|            0.3|      0.36|              0.1|   5.0|automatic|    nc|
|       38.8338318|       -122.8154984| 1.82|1.13|     md|22.0| 66.0| 0.01632|0.02| nc|nc73701236|2022-03-04 21:29:...|  earthquake|           0.19|      0.53|             0.14|   4.0|automati

In [48]:
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer


In [49]:
l=["magType","net","type","source"]
#indexer=StringIndexer(inputCol=["magType","net","type","source"],outputCol=["magType1","net1","type1","source1"],handleInvalid="keep",stringOrderType="frequencyDesc")



indexer = [
StringIndexer(inputCol=c, outputCol="{0}1".format(c))
for c in l
]

In [50]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [51]:
from pyspark.sql.functions import unix_timestamp

# Convert the "time" column to the number of seconds since the Unix epoch
data = data.withColumn("time", unix_timestamp(data["time"]) / 1000)

In [59]:
va=VectorAssembler(inputCols=["latitude","longitude","depth","magType1","net1","mag","nst","time"],outputCol="features")


In [69]:
pipeline = Pipeline(stages=indexer + [va])
df_tfm=pipeline.fit(data).transform(data)

In [68]:
train, test = df_tfm.randomSplit([0.7, 0.3])

In [70]:
num_rows_train = train.count()
num_cols_train = len(train.columns)
print("Training:",num_rows_train,"x",num_cols_train)

Training: 25710 x 24


In [71]:
num_rows_test = test.count()
num_cols_test = len(test.columns)
print("Training:",num_rows_test,"x",num_cols_test)

Training: 11063 x 24


In [72]:
df_tfm.columns

['latitude', 'longitude', 'depth', 'mag', 'magType', 'nst', 'gap', 'dmin', 'rms', 'net', 'id', 'time', 'type', 'horizontalError', 'depthError', 'magError', 'magNst', 'status', 'source', 'magType1', 'net1', 'type1', 'source1', 'features']


In [73]:
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
scaler_model = scaler.fit(df_tfm)

In [74]:
train=scaler_model.transform(df_tfm)

In [75]:
test=scaler_model.transform(test)

In [76]:
train.show(3,False )

+----------+------------+-----+----+-------+----+-----+--------+----+---+----------+-----------+----------+---------------+----------+--------+------+---------+------+--------+----+-----+-------+------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|latitude  |longitude   |depth|mag |magType|nst |gap  |dmin    |rms |net|id        |time       |type      |horizontalError|depthError|magError|magNst|status   |source|magType1|net1|type1|source1|features                                                    |scaled_features                                                                                                                                            |
+----------+------------+-----+----+-------+----+-----+--------+----+---+----------+-----------+----------+---------------+----------+--------+------+---------+------+-------

In [77]:
from pyspark.ml.classification import LogisticRegression

In [78]:
log=LogisticRegression(featuresCol='scaled_features',labelCol='type1')

In [79]:
lrmodel=log.fit(train)

In [80]:
prediction=lrmodel.transform(test)

In [81]:
test.show(3)

+--------+---------+-----+----+-------+----+-----+------+----+---+------------+-----------+----------+---------------+----------+--------+------+--------+------+--------+----+-----+-------+--------------------+--------------------+
|latitude|longitude|depth| mag|magType| nst|  gap|  dmin| rms|net|          id|       time|      type|horizontalError|depthError|magError|magNst|  status|source|magType1|net1|type1|source1|            features|     scaled_features|
+--------+---------+-----+----+-------+----+-----+------+----+---+------------+-----------+----------+---------------+----------+--------+------+--------+------+--------+----+-----+-------+--------------------+--------------------+
| 17.8638| -68.5596| 80.0|3.57|     md|18.0|207.0|0.6741|0.59| pr|pr2022087000|1648506.341|earthquake|           3.82|      4.94|    0.25|  11.0|reviewed|    pr|     1.0| 6.0|  0.0|    6.0|[17.8638,-68.5596...|[1.29560305056788...|
| 17.9123| -66.9085| 10.0| 3.3|     md|25.0|200.0|0.0693|0.16| pr|pr2022

In [82]:
from pyspark.ml.evaluation import RegressionEvaluator

In [83]:
test.groupby("type").count().show()

+------------------+-----+
|              type|count|
+------------------+-----+
|         explosion|  109|
|      quarry blast|  151|
|       other event|    2|
|        earthquake|10800|
|chemical explosion|    1|
+------------------+-----+


In [84]:
train.groupby("type").count().show()

+------------------+-----+
|              type|count|
+------------------+-----+
|         explosion|  370|
|      quarry blast|  501|
|       other event|    2|
|        earthquake|35899|
|chemical explosion|    1|
+------------------+-----+


In [85]:
# Use the MulticlassClassificationEvaluator to evaluate the model's accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="type1", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
print("Accuracy:", accuracy)
# Select the "prediction" and "label" columns
predictions_df = prediction.select(["prediction", "type1"])

# Convert the predictions and labels to Pandas dataframes for easier inspection
predictions_pd = predictions_df.toPandas()

# Print the first 10 predictions and their corresponding true labels
print(predictions_pd.head(10))
# Set the hyperparameters for the logistic regression model
lr = LogisticRegression(labelCol='type1', featuresCol='features')

# Fit the model to the training data
lr_model = lr.fit(train)

# Make predictions on the test data
predictions = lr_model.transform(test)
# Save the model to a file
#lr_model.save("logistic_regression_model1")

# Load the saved model
#loaded_model = LogisticRegression.load("/content/logistic_regression_model1")

accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9746000180782789
   prediction  type1
0         0.0    0.0
1         0.0    0.0
2         0.0    0.0
3         0.0    0.0
4         0.0    0.0
5         0.0    0.0
6         0.0    0.0
7         0.0    0.0
8         0.0    0.0
9         0.0    0.0
Accuracy: 0.9746000180782789


In [86]:
from pyspark.ml.classification import RandomForestClassifier

In [87]:
rand=RandomForestClassifier(featuresCol='scaled_features',labelCol='type1')
rmodel=rand.fit(train)

In [88]:
predictionrand=rmodel.transform(test)

In [89]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="type1", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictionrand)
print("Accuracy:", accuracy)
# Select the "prediction" and "label" columns
predictions_df = predictionrand.select(["prediction", "type1"])

# Convert the predictions and labels to Pandas dataframes for easier inspection
predictions_pd = predictions_df.toPandas()

# Print the first 10 predictions and their corresponding true labels
print(predictions_pd.head(10))


Accuracy: 0.98607972521016
   prediction  type1
0         0.0    0.0
1         0.0    0.0
2         0.0    0.0
3         0.0    0.0
4         0.0    0.0
5         0.0    0.0
6         0.0    0.0
7         0.0    0.0
8         0.0    0.0
9         0.0    0.0


In [90]:
# Set the hyperparameters for the logistic regression model
regrand = RandomForestClassifier(labelCol='type1', featuresCol='features',numTrees=100,maxDepth=5)

# Fit the model to the training data
regmodel = regrand.fit(train)

# Make predictions on the test data
predictions = regmodel.transform(test)

accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9864412907891169


In [91]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Define the hyperparameters to tune
hyperparameters = [
    {'regParam': [0.1, 0.01, 0.001], 'elasticNetParam': [0.0, 0.5, 1.0]},
    {'regParam': [0.1, 0.01, 0.001], 'elasticNetParam': [0.0, 0.5, 1.0], 'maxIter': [10, 50, 100]}
]

In [92]:
param_grid = ParamGridBuilder().addGrid(log.regParam, hyperparameters[0]['regParam'])\
                               .addGrid(log.elasticNetParam, hyperparameters[0]['elasticNetParam'])\
                               .build()

In [93]:
cv = CrossValidator(estimator=log, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=2)

In [94]:
model = cv.fit(train)

In [95]:
model.params

[Param(parent='CrossValidatorModel_a964530ec205', name='estimator', doc='estimator to be cross-validated'), Param(parent='CrossValidatorModel_a964530ec205', name='estimatorParamMaps', doc='estimator param maps'), Param(parent='CrossValidatorModel_a964530ec205', name='evaluator', doc='evaluator used to select hyper-parameters that maximize the validator metric'), Param(parent='CrossValidatorModel_a964530ec205', name='seed', doc='random seed.')]


In [96]:
model.bestModel

LogisticRegressionModel: uid = LogisticRegression_1957cd105963, numClasses = 5, numFeatures = 8


In [97]:
predictions = model.transform(test)

accuracy = evaluator.evaluate(predictions)
print("Accuracy: ", accuracy)

Accuracy:  0.9762270631835849
