In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
data = {
    'CustomerID':[101,102,103],
    'Review': [' Great service!! ', 'delayed delivery :(', 'Excellent, will buy again!!!']
}

df = pd.DataFrame(data)
print("Original Reviews:\n", df['Review'])

Original Reviews:
 0                Great service!! 
1             delayed delivery :(
2    Excellent, will buy again!!!
Name: Review, dtype: object


In [3]:
df['CleanReview'] = df['Review'].str.strip()
df['CleanReview'] = df['CleanReview'].str.lower()
df['CleanReview'] = df['CleanReview'].str.replace(r'[^a-z0-9\s]', '', regex=True)
print("\nCleaned Reviews:\n", df['CleanReview'])


Cleaned Reviews:
 0               great service
1           delayed delivery 
2    excellent will buy again
Name: CleanReview, dtype: object


In [4]:
df = pd.DataFrame({'ProductCode': ['AB-12345', 'CD-67890', 'EF-13579']})
df['NumericPart'] = df['ProductCode'].str.extract(r'-(\d+)')
print(df)

  ProductCode NumericPart
0    AB-12345       12345
1    CD-67890       67890
2    EF-13579       13579


In [5]:
dates = ['2023-05-20', 'May 20, 2023', '20/05/2023', '20230520']
df = pd.DataFrame({'DateString': dates})
df['Date'] = pd.to_datetime(df['DateString'], dayfirst=True, errors='coerce')
print(df)

     DateString       Date
0    2023-05-20 2023-05-20
1  May 20, 2023        NaT
2    20/05/2023        NaT
3      20230520        NaT


  df['Date'] = pd.to_datetime(df['DateString'], dayfirst=True, errors='coerce')


In [6]:
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
print(df)

     DateString       Date    Year  Month   Day
0    2023-05-20 2023-05-20  2023.0    5.0  20.0
1  May 20, 2023        NaT     NaN    NaN   NaN
2    20/05/2023        NaT     NaN    NaN   NaN
3      20230520        NaT     NaN    NaN   NaN


In [7]:
bad_dates = ['2023-05-20', 'not a date', '2022-12-01']
df = pd.DataFrame({'DateString': bad_dates})
df['Date'] = pd.to_datetime(df['DateString'], errors='coerce')
print(df)

   DateString       Date
0  2023-05-20 2023-05-20
1  not a date        NaT
2  2022-12-01 2022-12-01


In [9]:
log_data = {
    'Timestamp': ['2023-05-20 14:23:45.123', '2023-05-21 15:00:00.000', 'invalid timestamp']
}
df = pd.DataFrame(log_data)
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')

# Extract hour and minute for time-based analysis
df['Hour'] = df['Timestamp'].dt.hour 
df['Minute'] = df['Timestamp'].dt.minute
print(df)

                Timestamp  Hour  Minute
0 2023-05-20 14:23:45.123  14.0    23.0
1 2023-05-21 15:00:00.000  15.0     0.0
2                     NaT   NaN     NaN


In [11]:
data = {
    'Age':[25,32,47,51,62],
    'Income': [50000, 60000, 80000, 120000, 90000]
}
df = pd.DataFrame(data)

#Min-Max Scaling
scaler = MinMaxScaler()
df[['Age_scaled', 'Income_scaled']] = scaler.fit_transform(df[['Age', 'Income']])

#Standardization
standardizer = StandardScaler()
df[['Age_std', 'Income_std']] = standardizer.fit_transform(df[['Age', 'Income']])

print(df)

   Age  Income  Age_scaled  Income_scaled   Age_std  Income_std
0   25   50000    0.000000       0.000000 -1.382872   -1.224745
1   32   60000    0.189189       0.142857 -0.856780   -0.816497
2   47   80000    0.594595       0.428571  0.270562    0.000000
3   51  120000    0.702703       1.000000  0.571186    1.632993
4   62   90000    1.000000       0.571429  1.397904    0.408248


In [13]:
df_cat = pd.DataFrame({'Color': ['red', 'blue', 'green', 'blue', 'red']})

#One-Hot Encoding
one_hot = pd.get_dummies(df_cat['Color'],prefix='Color')

#Label Encoding
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df_cat['Color_label'] = label_encoder.fit_transform(df_cat['Color'])

print("One-Hot Encoding:\n", one_hot) 
print("\nLabel Encoding:\n", df_cat)

One-Hot Encoding:
    Color_blue  Color_green  Color_red
0       False        False       True
1        True        False      False
2       False         True      False
3        True        False      False
4       False        False       True

Label Encoding:
    Color  Color_label
0    red            2
1   blue            0
2  green            1
3   blue            0
4    red            2


In [14]:
df_dates = pd.DataFrame({
    'Year': [2023, 2024],
    'Month': [5, 6],
    'Day': [20, 15]
})

df_dates['Date'] = pd.to_datetime(df_dates[['Year', 'Month', 'Day']])
print(df_dates)

   Year  Month  Day       Date
0  2023      5   20 2023-05-20
1  2024      6   15 2024-06-15


In [15]:
df_text = pd.DataFrame({'Review': ['Good', 'Excellent service', 'Poor']})
df_text['Review_length'] = df_text['Review'].str.len() 
print(df_text)

              Review  Review_length
0               Good              4
1  Excellent service             17
2               Poor              4


In [16]:
df = pd.DataFrame({'Score': [85, 92, 78, 90]})

def grade(score):
    if score >= 90:
        return 'A'
    elif score >= 80:
        return 'B'
    else:
        return 'C'

df['Grade'] = df['Score'].apply(grade) 
print(df)

   Score Grade
0     85     B
1     92     A
2     78     C
3     90     A


In [17]:
df_map = pd.DataFrame({'City': ['NY', 'LA', 'SF', 'LA']})
mapping = {'NY': 'New York', 'LA': 'Los Angeles', 'SF': 'San Francisco'}
df_map['City_full'] = df_map['City'].map(mapping)
print(df_map)

  City      City_full
0   NY       New York
1   LA    Los Angeles
2   SF  San Francisco
3   LA    Los Angeles


In [20]:
df = pd.DataFrame({'Income': [30000, 50000, 1000000, 40000, 60000]})
df['Log_Income'] = np.log(df['Income'])
print(df)

    Income  Log_Income
0    30000   10.308953
1    50000   10.819778
2  1000000   13.815511
3    40000   10.596635
4    60000   11.002100
