DATA_FROM_APACHE-SPARK

In [3]:
import pyspark.sql
from pyspark.sql import SparkSession

# Obtain the notebook-wide SparkSession (an already-running one is reused)
spark = (
    SparkSession.builder
    .appName('DataAnalysis')
    .getOrCreate()
)

DATA_RETRIEVAL

In [4]:
# CSV IN SPARK-DATAFRAME
# Read the student CSV with a header row and let Spark infer the column types.

df = spark.read.csv("C:/GITHUB_PROJECTS/StudDA_ML/student.csv", header=True, inferSchema=True)

# show() only prints rows and returns None, so its result must NOT be assigned
# back to df -- doing so would replace the DataFrame with None and break every
# later cell that uses df.
df.show(10)

+-------+------+---+------+----------+---------+-----------+------------+----------+--------------+--------+--------+----------+---+---+---+-----+-----------+-------+------+-----------+---+
|student|gender|age|region|traveltime|studytime|performance|parentincome|activities|educatedparent|internet|freetime|attendance| G1| G2| G3|total| percentage|percent|result|shortlisted|inc|
+-------+------+---+------+----------+---------+-----------+------------+----------+--------------+--------+--------+----------+---+---+---+-----+-----------+-------+------+-----------+---+
|      1|female| 18| urban|         2|        2|         60|         450|         0|             1|       1|       3|        63|  5|  6|  6|   18|       30.0|   30.0|     1|          0|100|
|      2|female| 17| urban|         1|        1|         40|         100|         0|             0|       0|       3|        83|  5|  5|  6|   16|26.66666667|   26.0|     0|          1| 65|
|      3|female| 15| urban|         1|        2|  

DATA_PREPROCESSING

In [None]:
# Print the column names and the types inferred when the CSV was read
df.printSchema()

In [None]:
from pyspark.sql.functions import isnan, when, count, col

# Count the number of missing values in each column.
# isnan() is defined for float/double columns only; on other types it depends
# on implicit casts (and is rejected for types that cannot be cast), so the NaN
# test is restricted to floating-point columns. NULLs are checked everywhere.
nan_capable = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

missing_counts = [
    count(
        when(
            (isnan(col(c)) | col(c).isNull()) if c in nan_capable else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c in df.columns
]
df.select(missing_counts).show()

In [None]:
# Two missing-value strategies applied back to back:
#   1. discard every row that has a missing value in any column
#   2. replace a missing 'age' with 0
# NOTE(review): once step 1 has run no missing values remain, so step 2 does
# not change anything here.
df = df.dropna().na.fill({'age': 0})


In [None]:
# Grouping and aggregating with groupBy() plus avg() / count() / sum()
# Mean age within each gender group
df.groupBy('gender').avg('age').show()

In [None]:
# Number of rows in each gender group
rows_per_gender = df.groupBy('gender').count()
rows_per_gender.show()

EXPLORATORY_DATA_ANALYSIS (EDA)

In [None]:
from pyspark.sql.functions import col
import plotly.express as px

# Students per age, computed in Spark
grouped_df = df.groupBy("age").count()

# Bring the (small) aggregated result to the driver as a pandas DataFrame
pandas_df = grouped_df.toPandas()

# Bar chart of the per-age counts
fig = px.bar(
    data_frame=pandas_df,
    x='age',
    y='count',
    title='Student Count by Age',
)
fig.show()

MACHINE LEARNING
LOGISTIC_CLASSIFICATION_MODEL

In [None]:
# import necessary packages
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# select relevant columns
# The original cell read from `data` before it was ever assigned (NameError on
# a fresh kernel); the source is the cleaned student DataFrame `df`.
#
# The following cells expect a label column named "pass". The student table
# displayed when the CSV was loaded has no such column; its 0/1 "result"
# column is presumably the pass/fail flag -- TODO confirm. If a "pass" column
# does exist it is used unchanged.
label_source = "pass" if "pass" in df.columns else "result"
data = df.select("G1", "G2", "G3", df[label_source].alias("pass"))

# create feature vector: pack the three grade columns into one "features" column
assembler = VectorAssembler(inputCols=["G1", "G2", "G3"], outputCol="features")
data = assembler.transform(data)

In [None]:
# 80/20 train/test partition with a fixed seed
splits = data.randomSplit([0.8, 0.2], seed=42)
train_data, test_data = splits

# Logistic regression on the assembled features, fitted on the training part
lr = LogisticRegression(
    featuresCol="features",
    labelCol="pass",
    maxIter=10,
    regParam=0.01,
)
lr_model = lr.fit(train_data)

# Score the held-out rows
predictions = lr_model.transform(test_data)

In [None]:
# Accuracy of the predictions on the test set
evaluator = MulticlassClassificationEvaluator(
    labelCol="pass",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

# Confusion matrix: row count for every (actual, predicted) pair
confusion = predictions.groupBy("pass", "prediction").count()
confusion.show()

In [None]:
# Score unseen students: load the new CSV (header row, inferred types)
new_data = spark.read.csv(
    "C:/nasoindiadev/interns-20222023/dataanalytics/code/data/new_stud.csv",
    header=True,
    inferSchema=True,
)

# Keep only the grade columns and assemble them with the same assembler used
# for the training data
new_data = assembler.transform(new_data.select("G1", "G2", "G3"))

# Apply the fitted model
new_predictions = lr_model.transform(new_data)

# Display every column of the scored rows
new_predictions.show()