In [1]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

from src.utils import fetch_data_from_db

In [2]:
# Load the data
query = "SELECT * FROM xdr_data"
df = fetch_data_from_db(query)

In [3]:



def preprocess_data(df, timestamp_cols, numeric_features, categorical_features):
    
    df_numeric.reset_index(drop=True, inplace=True)
    df_categorical.reset_index(drop=True, inplace=True)
    df_preprocessed = pd.concat([df_numeric, df_categorical], axis=1)
    
    return df_preprocessed

In [17]:
# Replace 'undefined' with NaN
df.replace('undefined', np.nan, inplace=True)

In [18]:
# Preprocess the data
timestamp_cols = ['Start', 'End']
numeric_features = None
categorical_features = ['IMSI', 'MSISDN/Number', 'IMEI', 'Last Location Name', 'Handset Type', 'Handset Manufacturer']


In [19]:
# Convert the 'End time of the xDR (last frame timestamp)' and 'Start' columns to datetime
timestamp_features = []
for col in timestamp_cols:
    df[col] = pd.to_datetime(df[col])

    # Extract features from the timestamp columns
    df[f'{col}_hour'] = df[col].dt.hour
    df[f'{col}_day_of_week'] = df[col].dt.dayofweek
    df[f'{col}_day_of_month'] = df[col].dt.day
    df[f'{col}_month'] = df[col].dt.month

    # Add the new features to the timestamp_features list
    timestamp_features.extend([f'{col}_hour', f'{col}_day_of_week', f'{col}_day_of_month', f'{col}_month'])

# Identify numeric columns
if numeric_features is None:
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Remove timestamp features from the numeric_features list
numeric_features = [feature for feature in numeric_features if feature not in timestamp_features]

# Define preprocessing for numeric columns (replace missing values with mean and scale values)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Define preprocessing for categorical columns (replace missing values with mode)
# Define preprocessing for categorical columns (replace missing values with mode)
# Define preprocessing for categorical columns (replace missing values with 'Unknown')
# Define preprocessing for categorical columns (replace missing values with mode)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])


In [20]:
# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [21]:

# Fit and transform the data
df_preprocessed = preprocessor.fit_transform(df)


In [15]:

# Get feature names after transformation
numeric_features_transformed = preprocessor.transformers_[0][-1]
categorical_features_transformed = preprocessor.transformers_[1][-1]

# Convert the preprocessed data back to a DataFrame
df_preprocessed = pd.DataFrame(df_preprocessed, columns=numeric_features_transformed + categorical_features_transformed)

In [16]:

# Convert the preprocessed data back to a DataFrame
# Join dataframes on index
# combined_df = df_numeric.join(df_categorical)

df_preprocessed.head(200)

Unnamed: 0,Bearer Id,Start ms,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Avg RTT DL (ms),Avg RTT UL (ms),Avg Bearer TP DL (kbps),...,Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes),IMSI.1,MSISDN/Number.1,IMEI.1,Last Location Name,Handset Type,Handset Manufacturer
0,1.031911,0.938332,0.566475,21.213047,-0.008932,-0.00337,-0.578959,-0.121206,-0.165464,-0.553863,...,-1.025295,0.115243,-0.387668,-0.597047,208201448079117.0,33664962239.0,35521209507511.0,9.16456699548519E+015,Samsung Galaxy A5 Sm-A520F,Samsung
1,1.031911,-0.915382,0.372095,15.554551,0.012569,-0.003363,-0.566766,-0.080087,-0.165464,-0.554155,...,0.435041,1.423855,1.124409,0.814043,208201909211140.0,33681854413.0,35794009006359.0,L77566A,Samsung Galaxy J5 (Sm-J530),Samsung
2,1.031911,-1.726165,0.531765,15.513311,-0.061789,-0.003331,-0.589672,-0.0,0.0,-0.554572,...,-0.042795,-0.849035,-1.173927,-0.716127,208200314458056.0,33760627129.0,35281510359387.0,D42335A,Samsung Galaxy A8 (2018),Samsung
3,1.031911,-0.045696,-1.137819,15.016588,-0.011065,-0.003335,-0.586315,-0.0,0.0,-0.552987,...,1.348412,0.950408,0.195366,1.603109,208201402342131.0,33750343200.0,35356610164913.0,T21824A,Huawei B528S-23A,Apple
4,1.031911,0.22803,1.580027,12.147531,-0.011108,-0.003355,-0.584063,-0.0,0.0,-0.554572,...,0.532922,1.183799,-0.228656,0.468971,208201401415120.0,33699795932.0,35407009745539.0,D88865A,Samsung Sm-G390F,Samsung
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,-0.967194,0.855174,0.854574,0.513869,-0.008948,-0.003368,-0.585203,-0.158751,-0.191597,-0.552862,...,0.602386,-0.80206,0.285382,0.302916,208201447750120.0,33668451771.0,35381508463192.0,D12742C,Apple iPhone 7 Plus (A1784),Apple
196,1.031911,0.5468,1.715399,0.59883,-0.029379,-0.003327,-0.580299,-0.14266,-0.113196,-0.55257,...,-0.586588,0.947512,-0.90315,-1.039031,208201009545212.0,33768127237.0,35491209048289.0,T42450A,Apple iPhone 7 (A1660),Apple
197,1.031911,-1.11981,1.406474,0.543325,-0.029412,-0.003371,-0.584139,-0.105116,-0.165464,-0.554489,...,-0.237292,0.26915,0.28848,0.721782,208201008858129.0,33662105508.0,35405309402701.0,D92954B,Samsung Galaxy J5 (Sm-J530),Samsung
198,1.031911,-1.667262,1.514077,0.515325,0.012587,-0.003369,1.682577,-0.124782,-0.13933,-0.552319,...,-1.688325,0.401961,-1.259991,1.29844,208201909612196.0,33667238502.0,86119704141493.0,D73604C,Huawei P20,Huawei


In [2]:
from src.preprocessing import preprocess_data
from src.utils import fetch_telecom_data

# Load the data
df = fetch_telecom_data()

# Define the columns
timestamp_cols = ['Start', 'End']
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = ['IMSI', 'MSISDN/Number', 'IMEI', 'Last Location Name', 'Handset Type', 'Handset Manufacturer']

# Preprocess the data
df_preprocessed = preprocess_data(df, timestamp_cols, numeric_features, categorical_features)

In [3]:
df_preprocessed.head()

Unnamed: 0,Bearer Id,Start ms,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Avg RTT DL (ms),Avg RTT UL (ms),Avg Bearer TP DL (kbps),...,Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes),IMSI.1,MSISDN/Number.1,IMEI.1,Last Location Name,Handset Type,Handset Manufacturer
0,1.031911,0.938332,0.566475,21.213047,-0.008932,-0.00337,-0.578959,-0.121206,-0.165464,-0.553863,...,-1.025295,0.115243,-0.387668,-0.597047,208201448079117.0,33664962239.0,35521209507511.0,9.16456699548519E+015,Samsung Galaxy A5 Sm-A520F,Samsung
1,1.031911,-0.915382,0.372095,15.554551,0.012569,-0.003363,-0.566766,-0.080087,-0.165464,-0.554155,...,0.435041,1.423855,1.124409,0.814043,208201909211140.0,33681854413.0,35794009006359.0,L77566A,Samsung Galaxy J5 (Sm-J530),Samsung
2,1.031911,-1.726165,0.531765,15.513311,-0.061789,-0.003331,-0.589672,-0.0,0.0,-0.554572,...,-0.042795,-0.849035,-1.173927,-0.716127,208200314458056.0,33760627129.0,35281510359387.0,D42335A,Samsung Galaxy A8 (2018),Samsung
3,1.031911,-0.045696,-1.137819,15.016588,-0.011065,-0.003335,-0.586315,-0.0,0.0,-0.552987,...,1.348412,0.950408,0.195366,1.603109,208201402342131.0,33750343200.0,35356610164913.0,T21824A,Huawei B528S-23A,Apple
4,1.031911,0.22803,1.580027,12.147531,-0.011108,-0.003355,-0.584063,-0.0,0.0,-0.554572,...,0.532922,1.183799,-0.228656,0.468971,208201401415120.0,33699795932.0,35407009745539.0,D88865A,Samsung Sm-G390F,Samsung


In [4]:
# Save the preprocessed data
from src.utils import save_data_to_db
save_data_to_db(df_preprocessed, 'clean_data')

DuplicateColumnError: A column with name 'IMSI' is already present in table 'clean_data'.