In [33]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col, isnan

In [34]:
sparkSession = SparkSession.builder.appName('DataProcessing').getOrCreate() 

In [35]:
df_pyspark = sparkSession.read.csv(
    'activation_fixe.csv', 
    header=True, 
    inferSchema=True,
    sep=';'  
) 

In [36]:
df_pyspark.printSchema() 

root
 |-- ID: string (nullable = true)
 |-- KIT_CODE: string (nullable = true)
 |-- SIM: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- COMPLEMENT: integer (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- PHONE: string (nullable = true)
 |-- PROFESSION: string (nullable = true)
 |-- OFFRE: string (nullable = true)
 |-- DEBIT: string (nullable = true)
 |-- MODE_PAIEMENT: integer (nullable = true)
 |-- BIRTH_DATE: string (nullable = true)
 |-- COUNTRY: integer (nullable = true)
 |-- _c14: string (nullable = true)
 |-- ID_TYPE: integer (nullable = true)
 |-- ENTREPRISE_NAME: string (nullable = true)
 |-- ACTIVITY_DOMAIN: string (nullable = true)
 |-- TRADE_REGISTER_NUMBER: string (nullable = true)
 |-- ADDRESS: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- GOVERNORATE: string (nullable = true)
 |-- POSTAL_CODE: integer (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- FAX: string (null

In [37]:
df_pyspark.show() 

+----------+--------------+----+----+----------+--------------+------------+-----+----------+--------------------+-----+-------------+-------------------+-------+----+-------+---------------+---------------+---------------------+--------------------+----------------+-----------+-----------+--------------------+----+----------------+---------------------+---------+----+-------+------------------+-----------------------+----+-------------------+-----------+-------+-----------------------+-----------------+----------+------+-----------------+------------+-------+--------------------+------+------------+---------------+----+----+-----------+-----------------+-------------+--------------------+------------+-------------+-------------+----+-------------+-------------+----+------------------+------------------+----------------+------------+----+-----+----------+---------------+----+------+------------------------+------------------------+-----------------+--------------------+----------------

In [38]:
df_pyspark.count() 

3917

In [29]:
num_columns = len(df_pyspark.columns)

print(f"Number of columns: {num_columns}")

Number of columns: 84


In [24]:
df_pyspark.show(truncate=False)

+----------+--------------+----+----+----------+--------------+------------+-----+----------+-------------------------------+-----+-------------+-------------------+-------+----+-------+---------------+---------------+---------------------+------------------------------------+----------------+-----------+-----------+---------------------------+----+----------------+---------------------+---------+----+-------+------------------+-----------------------+----+-------------------+-----------+-------+-----------------------+-----------------+----------+------+-----------------+------------+-------+------------------------------------+------+------------+---------------+----+----+-----------+-----------------+-------------+-----------------------------+------------+-------------+-------------+----+-------------+-------------+----+------------------+------------------+----------------+------------+----+-----+----------+---------------+----+------+------------------------+---------------------

In [25]:
null_columns = []
for column in df_pyspark.columns:
    
    null_count = df_pyspark.filter(col(column).isNull() | isnan(col(column))).count()
    if null_count == df_pyspark.count():
        null_columns.append(column)

num_null_columns = len(null_columns)

In [30]:
print(f"Number of columns with all null values: {num_null_columns}")
print(f"Columns with all null values: {null_columns}")

Number of columns with all null values: 20
Columns with all null values: ['SIM', '_c3', 'PHONE', 'PROFESSION', '_c14', 'ENTREPRISE_NAME', 'ACTIVITY_DOMAIN', 'TRADE_REGISTER_NUMBER', 'FAX', '_c32', 'NATIONALITY', 'MESSAGE', 'CONTRACT_RECEPTION_DATE', 'PAYMENT_REFERENCE', 'PACKAGE_TYPE', 'IP', '_c48', 'NUM_SERIE_CLE', 'DATE_MAJ_FLAG', 'PIN']


In [31]:
column_names = df_pyspark.columns
print("Column names:")
for col_name in column_names:
    print(col_name)

Column names:
ID
KIT_CODE
SIM
_c3
COMPLEMENT
FIRST_NAME
LAST_NAME
PHONE
PROFESSION
OFFRE
DEBIT
MODE_PAIEMENT
BIRTH_DATE
COUNTRY
_c14
ID_TYPE
ENTREPRISE_NAME
ACTIVITY_DOMAIN
TRADE_REGISTER_NUMBER
ADDRESS
CITY
GOVERNORATE
POSTAL_CODE
EMAIL
FAX
OPERATION_STATUS
OPERATION_STATUS_DATE
DEALER_ID
TYPE
USER_ID
TRANSACTION_STATUS
TRANSACTION_STATUS_DATE
_c32
CREATION_DATE
NATIONALITY
MESSAGE
CONTRACT_RECEPTION_DATE
CONTRACT_RECIEVED
COMMISSION
AMOUNT
PAYMENT_REFERENCE
COMMISSIONED
ANOMALY
ID_BSCS_TRANSACTION
TMCODE
PACKAGE_TYPE
ACTIVATION_TYPE
IP
_c48
CO_ID
ADSL_LOGIN
ADSL_PASSWORD
SIP_LOGIN
SIP_PASSWORD
DELEGATION
NUM_SERIE_CLE
FLAG
DATE_MAJ_FLAG
FRAIS_RACCORD
PIN
LATITUDE
LONGITUDE
DATE_RENDEZ_VOUS
NOM_IMMEUBLE
BLOC
ETAGE
NUM_APPART
ONT_SN
ONT
LEVEL4
TYPE_FILE_IDENTITE_RSCAN
TYPE_FILE_IDENTITE_VSCAN
TYPE_FILE_CONTRAT
TYPE_FILE_ABONNEMENT
TYPE_FILE_CONDITION
CASEID
CASE_DATE
SNCODE
BTS
FLAG_BTS
DATE_UPDATE_BTS
REPETITEUR
REP_FLAG
SLA


In [40]:
columns_to_drop = ["_c3", "_c14", "_c32", "_c48"]
df_pyspark = df_pyspark.drop(*columns_to_drop)

In [41]:
print("Schema after dropping unnamed columns:")
df_pyspark.printSchema()

Schema after dropping columns:
root
 |-- ID: string (nullable = true)
 |-- KIT_CODE: string (nullable = true)
 |-- SIM: string (nullable = true)
 |-- COMPLEMENT: integer (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- PHONE: string (nullable = true)
 |-- PROFESSION: string (nullable = true)
 |-- OFFRE: string (nullable = true)
 |-- DEBIT: string (nullable = true)
 |-- MODE_PAIEMENT: integer (nullable = true)
 |-- BIRTH_DATE: string (nullable = true)
 |-- COUNTRY: integer (nullable = true)
 |-- ID_TYPE: integer (nullable = true)
 |-- ENTREPRISE_NAME: string (nullable = true)
 |-- ACTIVITY_DOMAIN: string (nullable = true)
 |-- TRADE_REGISTER_NUMBER: string (nullable = true)
 |-- ADDRESS: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- GOVERNORATE: string (nullable = true)
 |-- POSTAL_CODE: integer (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- FAX: string (nullable = true)
 |-- OPERATION_STATUS: inte