import data

In [3]:
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

sessions = pd.read_csv('charging_sessions.csv', skipinitialspace=True)
stations = pd.read_csv('charging_stations.csv', skipinitialspace=True)
customers = pd.read_csv('customers.csv', skipinitialspace=True)
districts = pd.read_csv('districts.csv', skipinitialspace=True)




initial exploration

In [10]:
print(districts.columns)
print(districts.shape)
districts.head()
districts.info()
print(districts.describe())

Int64Index([0, 1, 2, 3, 4], dtype='int64')
(19, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       19 non-null     object
 1   1       19 non-null     object
 2   2       19 non-null     object
 3   3       19 non-null     object
 4   4       19 non-null     object
dtypes: object(5)
memory usage: 892.0+ bytes
                    0        1     2   3    4
count              19       19    19  19   19
unique             19        4    17  17   17
top     district_name  Low-Mid  1800  55  360
freq                1        9     2   2    2


identifying unknown values

In [22]:
print(sessions['customer_id'].value_counts(dropna=False))

C1537    32
C5499    30
C4803    30
C1113    29
C3387    28
         ..
C1346     3
C3659     3
C5522     3
C4431     2
C2577     2
Name: customer_id, Length: 8000, dtype: int64


cleaning up data

In [24]:
# replacing missing values with 'Unknown'
sessions.fillna('Unknown', inplace=True)
stations.fillna('Unknown', inplace=True)
customers.fillna('Unknown', inplace=True)
districts.fillna('Unknown', inplace=True)

print(sessions.isna().sum())
print(stations.isna().sum())
print(customers.isna().sum())
print(districts.isna().sum())


session_id            0
customer_id           0
station_id            0
session_start_time    0
kwh_charged           0
cost_per_kwh          0
total_cost            0
dtype: int64
station_id       0
district_name    0
income_tier      0
operator_name    0
plugs_count      0
latitude         0
longitude        0
dtype: int64
customer_id             0
income_tier             0
car_model               0
battery_capacity_kwh    0
dtype: int64
district_name                  0
income_tier                    0
projected_evs                  0
projected_plugs                0
projected_regular_customers    0
dtype: int64


categorical/binary columns

In [28]:
#finding any weird category values
for col in sessions.select_dtypes(include='object').columns:
    print(f"Column: {col}")
    print(sessions[col].value_counts())
    print()

#finding any weird category values


Column: session_id
S00001     1
S75998     1
S76009     1
S76008     1
S76007     1
          ..
S37996     1
S37995     1
S37994     1
S37993     1
S114000    1
Name: session_id, Length: 114000, dtype: int64

Column: customer_id
C1537    32
C5499    30
C4803    30
C1113    29
C3387    28
         ..
C1346     3
C3659     3
C5522     3
C4431     2
C2577     2
Name: customer_id, Length: 8000, dtype: int64

Column: session_start_time
2025-10-10 09:40:00    8
2025-10-26 09:32:00    8
2025-08-14 14:43:00    7
2025-09-17 12:39:00    7
2025-10-19 12:40:00    7
                      ..
2025-09-27 07:00:00    1
2025-09-01 10:56:00    1
2025-11-07 15:44:00    1
2025-09-11 13:37:00    1
2025-09-22 21:36:00    1
Name: session_start_time, Length: 72084, dtype: int64



removing duplicates

In [29]:
sessionsDuplicates = sessions.duplicated().sum()
stationsDuplicates = stations.duplicated().sum()
customersDuplicates = customers.duplicated().sum()
districtsDuplicates = districts.duplicated().sum()

print(f"sessions duplicates: {sessionsDuplicates}")
print(f"stations duplicates: {stationsDuplicates}")
print(f"customers duplicates: {customersDuplicates}")
print(f"districts duplicates: {districtsDuplicates}")


sessions duplicates: 0
stations duplicates: 0
customers duplicates: 0
districts duplicates: 0


data type conversions

In [30]:
print(sessions.dtypes)
print(stations.dtypes)
print(customers.dtypes)
print(districts.dtypes)

session_id             object
customer_id            object
station_id              int64
session_start_time     object
kwh_charged           float64
cost_per_kwh          float64
total_cost            float64
dtype: object
station_id         int64
district_name     object
income_tier       object
operator_name     object
plugs_count        int64
latitude         float64
longitude        float64
dtype: object
customer_id              object
income_tier              object
car_model                object
battery_capacity_kwh    float64
dtype: object
district_name                  object
income_tier                    object
projected_evs                   int64
projected_plugs                 int64
projected_regular_customers     int64
dtype: object
