In [10]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (getOrCreate returns the
# existing session instead when one is already running).
spark = (
    SparkSession.builder
    .appName("LoanPrediction")
    .getOrCreate()
)


In [11]:
# Read the loan CSV: the first line is treated as the header row and Spark
# infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/DATA/loan.csv")
)


                                                                                

In [12]:
# Count the rows that have no missing value in any column. A notebook cell
# only displays its last expression, so the count has to be printed
# explicitly; otherwise the Spark job runs and its result is thrown away.
complete_row_count = df.na.drop().count()
print("Rows without any null value:", complete_row_count)

# Show one record vertically (one "column | value" line per field) without
# truncating values. A single record is enough to inspect the columns and
# avoids dumping 20 very wide records into the output.
df.show(n=1, truncate=False, vertical=True)





-RECORD 0-----------------------------------------------------------------
 id                                         | null                        
 member_id                                  | null                        
 loan_amnt                                  | 2500                        
 funded_amnt                                | 2500                        
 funded_amnt_inv                            | 2500.0                      
 term                                       |  36 months                  
 int_rate                                   | 13.56                       
 installment                                | 84.92                       
 grade                                      | C                           
 sub_grade                                  | C1                          
 emp_title                                  | Chef                        
 emp_length                                 | 10+ years                   
 home_ownership          

                                                                                

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

from pyspark.sql import functions as F

# getOrCreate returns the already-running Spark session if there is one.
spark = SparkSession.builder.appName("MissingValuesPercentage").getOrCreate()

# Minimum percentage of non-null values a column must have to be kept.
# Columns that are filled in for less than this share of rows are dropped.
threshold_percentage = 20

# Count the rows once and collect every column's non-null count in a single
# aggregation. F.count(column) counts only non-null values, so this matches
# filter(isNotNull).count() without launching two Spark jobs per column.
total_rows = df.count()
if total_rows == 0:
    raise ValueError("Cannot compute missing-value percentages: df has no rows")

non_null_counts = df.select(
    [F.count(F.col(c)).alias(c) for c in df.columns]
).first()

# The aggregated row holds one count per column, in df.columns order.
selected_columns = [
    c
    for c, non_null in zip(df.columns, non_null_counts)
    if (non_null / total_rows) * 100 >= threshold_percentage
]

# Build the reduced DataFrame that contains only the retained columns.
new_df = df.select(selected_columns)

# Report how many columns survived the filter.
print("Total features before:", len(df.columns))
print("Total features now:", len(new_df.columns))



Total features before: 145
Total features now: 106


                                                                                

In [14]:
# Display the first records of the reduced DataFrame, one field per line and
# with values shown in full.
new_df.show(vertical=True, truncate=False)

-RECORD 0-----------------------------------------------------
 loan_amnt                      | 2500                        
 funded_amnt                    | 2500                        
 funded_amnt_inv                | 2500.0                      
 term                           |  36 months                  
 int_rate                       | 13.56                       
 installment                    | 84.92                       
 grade                          | C                           
 sub_grade                      | C1                          
 emp_title                      | Chef                        
 emp_length                     | 10+ years                   
 home_ownership                 | RENT                        
 annual_inc                     | 55000                       
 verification_status            | Not Verified                
 issue_d                        | Dec-2018                    
 loan_status                    | Current              

In [15]:
# getOrCreate returns the already-running Spark session if there is one.
spark = SparkSession.builder.appName("SelectFeatures").getOrCreate()

# Columns to keep for the rest of the notebook.
selected_features = ['term', 'int_rate', 'installment', 'loan_status']

# Project the DataFrame down to just those columns.
df_selected = new_df.select(*selected_features)

# Report how many columns were selected.
feature_count = len(selected_features)
print("Number of selected features:", feature_count)

Number of selected features: 4


In [16]:
# Preview the selected columns (first 20 rows).
df_selected.show(n=20)

+----------+--------+-----------+-----------+
|      term|int_rate|installment|loan_status|
+----------+--------+-----------+-----------+
| 36 months|   13.56|      84.92|    Current|
| 60 months|   18.94|     777.23|    Current|
| 36 months|   17.97|     180.69|    Current|
| 36 months|   18.94|     146.51|    Current|
| 60 months|   16.14|     731.78|    Current|
| 36 months|   15.02|     192.45|    Current|
| 36 months|   17.97|      72.28|    Current|
| 36 months|   13.56|     203.79|    Current|
| 36 months|   17.97|     180.69|    Current|
| 36 months|   14.47|     206.44|    Current|
| 36 months|   22.35|     211.05|    Current|
| 60 months|   11.31|     613.13|    Current|
| 36 months|    8.19|     351.95|    Current|
| 36 months|   17.97|      234.9|    Current|
| 60 months|   12.98|     500.35|    Current|
| 36 months|   16.14|      123.3|    Current|
| 36 months|   12.98|      235.8|    Current|
| 60 months|   16.91|     620.11|    Current|
| 60 months|   20.89|     431.87| 

In [18]:
from pyspark.sql import SparkSession

# getOrCreate returns the already-running Spark session if there is one.
spark = SparkSession.builder.appName("DropNullRows").getOrCreate()

# Discard every row that has a missing value in any of the selected columns.
# NOTE: this rebinds `df`, which previously referred to the DataFrame read
# from the CSV file.
df = df_selected.dropna()

# Preview the result (first 20 rows).
df.show(n=20)

+----------+--------+-----------+-----------+
|      term|int_rate|installment|loan_status|
+----------+--------+-----------+-----------+
| 36 months|   13.56|      84.92|    Current|
| 60 months|   18.94|     777.23|    Current|
| 36 months|   17.97|     180.69|    Current|
| 36 months|   18.94|     146.51|    Current|
| 60 months|   16.14|     731.78|    Current|
| 36 months|   15.02|     192.45|    Current|
| 36 months|   17.97|      72.28|    Current|
| 36 months|   13.56|     203.79|    Current|
| 36 months|   17.97|     180.69|    Current|
| 36 months|   14.47|     206.44|    Current|
| 36 months|   22.35|     211.05|    Current|
| 60 months|   11.31|     613.13|    Current|
| 36 months|    8.19|     351.95|    Current|
| 36 months|   17.97|      234.9|    Current|
| 60 months|   12.98|     500.35|    Current|
| 36 months|   16.14|      123.3|    Current|
| 36 months|   12.98|      235.8|    Current|
| 60 months|   16.91|     620.11|    Current|
| 60 months|   20.89|     431.87| 

In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import trim, regexp_replace
from pyspark.sql.types import IntegerType

# getOrCreate returns the already-running Spark session if there is one.
spark = SparkSession.builder.appName("TermColumnTransformation").getOrCreate()

# Convert values such as " 36 months" to the integer 36: remove the
# " months" suffix, trim the remaining whitespace, then cast to an integer.
months_removed = regexp_replace("term", " months", "")
term_as_int = trim(months_removed).cast(IntegerType())
df = df.withColumn("term", term_as_int)

# Preview the result (first 20 rows).
df.show(n=20)


+----+--------+-----------+-----------+
|term|int_rate|installment|loan_status|
+----+--------+-----------+-----------+
|  36|   13.56|      84.92|    Current|
|  60|   18.94|     777.23|    Current|
|  36|   17.97|     180.69|    Current|
|  36|   18.94|     146.51|    Current|
|  60|   16.14|     731.78|    Current|
|  36|   15.02|     192.45|    Current|
|  36|   17.97|      72.28|    Current|
|  36|   13.56|     203.79|    Current|
|  36|   17.97|     180.69|    Current|
|  36|   14.47|     206.44|    Current|
|  36|   22.35|     211.05|    Current|
|  60|   11.31|     613.13|    Current|
|  36|    8.19|     351.95|    Current|
|  36|   17.97|      234.9|    Current|
|  60|   12.98|     500.35|    Current|
|  36|   16.14|      123.3|    Current|
|  36|   12.98|      235.8|    Current|
|  60|   16.91|     620.11|    Current|
|  60|   20.89|     431.87|    Current|
|  60|   14.47|     305.67|    Current|
+----+--------+-----------+-----------+
only showing top 20 rows



In [20]:
from pyspark.sql.functions import when, expr

# getOrCreate returns the already-running Spark session if there is one.
spark = SparkSession.builder.appName("LabelEncoding").getOrCreate()

# Encode the loan status as a number: "Fully Paid" -> 0, "Charged Off" -> 1.
# There is no otherwise() branch, so every other status becomes null.
status = df["loan_status"]
encoded_status = when(status == "Fully Paid", 0).when(status == "Charged Off", 1)
df = df.withColumn("loan_status", encoded_status)

# Preview the result (first 20 rows).
df.show(n=20)

+----+--------+-----------+-----------+
|term|int_rate|installment|loan_status|
+----+--------+-----------+-----------+
|  36|   13.56|      84.92|       null|
|  60|   18.94|     777.23|       null|
|  36|   17.97|     180.69|       null|
|  36|   18.94|     146.51|       null|
|  60|   16.14|     731.78|       null|
|  36|   15.02|     192.45|       null|
|  36|   17.97|      72.28|       null|
|  36|   13.56|     203.79|       null|
|  36|   17.97|     180.69|       null|
|  36|   14.47|     206.44|       null|
|  36|   22.35|     211.05|       null|
|  60|   11.31|     613.13|       null|
|  36|    8.19|     351.95|       null|
|  36|   17.97|      234.9|       null|
|  60|   12.98|     500.35|       null|
|  36|   16.14|      123.3|       null|
|  36|   12.98|      235.8|       null|
|  60|   16.91|     620.11|       null|
|  60|   20.89|     431.87|       null|
|  60|   14.47|     305.67|       null|
+----+--------+-----------+-----------+
only showing top 20 rows



In [21]:
from pyspark.sql import SparkSession

# getOrCreate returns the already-running Spark session if there is one.
spark = SparkSession.builder.appName("DropNullRows").getOrCreate()

# Discard every row that still has a missing value in any column.
df = df.dropna()

# Preview the result (first 20 rows).
df.show(n=20)

+----+--------+-----------+-----------+
|term|int_rate|installment|loan_status|
+----+--------+-----------+-----------+
|  36|   22.35|    1151.16|          0|
|  60|   16.14|     975.71|          0|
|  36|    7.56|     622.68|          0|
|  36|   11.31|     147.99|          0|
|  36|   27.27|     345.18|          0|
|  60|   17.97|     507.55|          0|
|  36|   11.31|     217.05|          0|
|  36|   13.56|      84.92|          0|
|  36|   17.97|     144.55|          0|
|  36|    8.19|      84.85|          0|
|  36|    23.4|      38.92|          0|
|  36|    7.56|     467.01|          0|
|  36|    7.56|     249.08|          0|
|  36|    7.56|     155.67|          0|
|  36|    7.56|     403.19|          0|
|  60|   12.98|     237.67|          0|
|  60|   26.31|     602.49|          0|
|  60|   12.98|     227.43|          0|
|  36|   10.33|     941.87|          0|
|  36|   13.56|      33.97|          0|
+----+--------+-----------+-----------+
only showing top 20 rows



In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# getOrCreate returns the already-running Spark session if there is one.
spark = SparkSession.builder.appName("DataTypeCheck").getOrCreate()

# Print the full schema of the DataFrame.
df.printSchema()

# The target column is looked up as "label" here, but the cells that build
# `df` name it "loan_status". Use whichever one is actually present so the
# cell does not fail on the schema lookup.
label_column = "label" if "label" in df.columns else "loan_status"

# Look up and print the data type of each column of interest.
checked_columns = ["term", "int_rate", "installment", label_column]
data_types = [df.schema[name].dataType for name in checked_columns]
for column_name, data_type in zip(checked_columns, data_types):
    print(f"Veri Türü ({column_name}): {data_type}")

# Keep the individual variables this cell has always defined.
(
    term_data_type,
    int_rate_data_type,
    installment_data_type,
    loan_status_data_type,
) = data_types


root
 |-- term: integer (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- label: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- features1: vector (nullable = true)

Veri Türü (term): IntegerType
Veri Türü (int_rate): DoubleType
Veri Türü (installment): DoubleType
Veri Türü (label): IntegerType


In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand

# getOrCreate returns the already-running Spark session if there is one.
spark = SparkSession.builder.appName("SampleData").getOrCreate()

# Shrink the dataset to roughly 20% of its rows, sampling without
# replacement and with a fixed seed.
# NOTE: this rebinds `df`, so running the cell again samples the sample.
df = df.sample(withReplacement=False, fraction=0.2, seed=42)

# Preview the sampled rows (first 20 rows).
df.show(n=20)


+----+--------+-----------+-----+-------------------+-------------------+
|term|int_rate|installment|label|           features|          features1|
+----+--------+-----------+-----+-------------------+-------------------+
|  36|   13.56|      84.92|    0| [36.0,13.56,84.92]| [36.0,13.56,84.92]|
|  60|   26.31|     602.49|    0|[60.0,26.31,602.49]|[60.0,26.31,602.49]|
|  36|   10.33|     941.87|    0|[36.0,10.33,941.87]|[36.0,10.33,941.87]|
|  36|    11.8|     298.07|    0| [36.0,11.8,298.07]| [36.0,11.8,298.07]|
|  60|    8.19|     325.88|    0| [60.0,8.19,325.88]| [60.0,8.19,325.88]|
|  36|    8.19|     314.25|    0| [36.0,8.19,314.25]| [36.0,8.19,314.25]|
|  36|   19.92|     111.37|    0|[36.0,19.92,111.37]|[36.0,19.92,111.37]|
|  60|   10.33|     256.92|    0|[60.0,10.33,256.92]|[60.0,10.33,256.92]|
|  60|   15.02|     476.01|    0|[60.0,15.02,476.01]|[60.0,15.02,476.01]|
|  36|   12.98|     370.53|    0|[36.0,12.98,370.53]|[36.0,12.98,370.53]|
|  60|    11.8|     885.75|    0| [60.

                                                                                

In [32]:
from pyspark.sql import SparkSession

# Request 4 cores and 8 GB of memory per executor.
# NOTE(review): getOrCreate returns the existing session when one is already
# running; executor resources are presumably fixed when the application
# starts, so these settings may have no effect at this point in the
# notebook. Confirm, and consider setting them where the session is first
# created.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .config("spark.executor.cores", "4")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

In [37]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# getOrCreate returns the already-running Spark session if there is one.
spark = SparkSession.builder.appName("LoanStatusPrediction").getOrCreate()

# Predictor columns. This must be defined here: when the assignment is
# commented out, the cell only works if the name is left over in the kernel.
feature_columns = ["term", "int_rate", "installment"]

# Resolve the target column once and use it for BOTH training and
# evaluation. Without this the classifier falls back to its default label
# column ("label") while the evaluator reads "loan_status".
label_column = "loan_status" if "loan_status" in df.columns else "label"

# Combine the predictors into a single vector column. A "features" column
# left over from an earlier run is dropped first, because VectorAssembler
# refuses to write to an output column that already exists; drop() does
# nothing when the column is absent.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df.drop("features"))

# Split the data into training (80%) and test (20%) sets.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Network layout: one input per feature, one hidden layer of 5 units, and
# 2 output units (one per class).
layers = [len(feature_columns), 5, 2]
classifier = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol=label_column,
    layers=layers,
    seed=42,
)

# Train the model.
model = classifier.fit(train_data)

# Predict on the held-out test set.
predictions = model.transform(test_data)

# Measure classification accuracy against the same label column that was
# used for training.
evaluator = MulticlassClassificationEvaluator(
    labelCol=label_column,
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy:", accuracy)




Model Accuracy: 0.7987461297332641


