In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = {'Name': ['Fernando Torres', 'Steven Gerrard', 'Luis Suarez', 'Mohamed Salah'],
        'Date of Birth': ['1984-03-20', '1980-05-30', '1987-01-24', '1992-06-15'],
        'Age': [39, 43, 37, 31],
        'Nationality': ['Spain', 'United Kingdom', 'Uruguay', 'Egypt'],
        'Income': [ 45403410, 29339838, 199396947,  99762487],
        'Contact': ['FT 012-345-6789', 'SG 012-345-6789', 'LS 012-345-6789', 'MS 012-345-6789']}
df = pd.DataFrame(data)
print(df)

              Name Date of Birth  Age     Nationality     Income  \
0  Fernando Torres    1984-03-20   39           Spain   45403410   
1   Steven Gerrard    1980-05-30   43  United Kingdom   29339838   
2      Luis Suarez    1987-01-24   37         Uruguay  199396947   
3    Mohamed Salah    1992-06-15   31           Egypt   99762487   

           Contact  
0  FT 012-345-6789  
1  SG 012-345-6789  
2  LS 012-345-6789  
3  MS 012-345-6789  


In [3]:
# Detecting missing values
print(df.isnull())
# Detecting non-missing values
print(df.notnull())

    Name  Date of Birth    Age  Nationality  Income  Contact
0  False          False  False        False   False    False
1  False          False  False        False   False    False
2  False          False  False        False   False    False
3  False          False  False        False   False    False
   Name  Date of Birth   Age  Nationality  Income  Contact
0  True           True  True         True    True     True
1  True           True  True         True    True     True
2  True           True  True         True    True     True
3  True           True  True         True    True     True


In [4]:
df.dtypes

Name             object
Date of Birth    object
Age               int64
Nationality      object
Income            int64
Contact          object
dtype: object

In [5]:
# Removing rows with missing values
cleaned_df = df.dropna()

# Removing columns with missing values
cleaned_df = df.dropna(axis=1)

In [6]:
# Filling missing values with a specific value
filled_df = df.fillna(value=0)

# Filling missing values with the mean of the column
filled_df = df.fillna(value=df['Age'].mean())

In [7]:
# Linear interpolation
interpolated_df = df.interpolate(method='linear')

In [8]:
interpolated_df

Unnamed: 0,Name,Date of Birth,Age,Nationality,Income,Contact
0,Fernando Torres,1984-03-20,39,Spain,45403410,FT 012-345-6789
1,Steven Gerrard,1980-05-30,43,United Kingdom,29339838,SG 012-345-6789
2,Luis Suarez,1987-01-24,37,Uruguay,199396947,LS 012-345-6789
3,Mohamed Salah,1992-06-15,31,Egypt,99762487,MS 012-345-6789


In [9]:
# Converting data types
df['Age'] = df['Age'].astype('float64')

In [10]:
# Converting to DateTime
df['Date of Birth'] = pd.to_datetime(df['Date of Birth'])

In [11]:
# Converting to categorical data
df['Nationality'] = df['Nationality'].astype('category')

In [12]:
# Stripping extra spaces
df['Name'] = df['Name'].str.strip()

# Changing to lower case
df['Nationality'] = df['Nationality'].str.lower()

df

Unnamed: 0,Name,Date of Birth,Age,Nationality,Income,Contact
0,Fernando Torres,1984-03-20,39.0,spain,45403410,FT 012-345-6789
1,Steven Gerrard,1980-05-30,43.0,united kingdom,29339838,SG 012-345-6789
2,Luis Suarez,1987-01-24,37.0,uruguay,199396947,LS 012-345-6789
3,Mohamed Salah,1992-06-15,31.0,egypt,99762487,MS 012-345-6789


In [13]:
# Removing punctuation
df['Name'] = df['Name'].str.replace('[^\w\s]', '', regex=True)

# Splitting text into columns
df[['FirstName', 'LastName']] = df['Name'].str.split(' ', expand=True)

# Replacing text
df['Nationality'] = df['Nationality'].str.replace('united kingdom', 'uk')

# Changing to lower case
df['Nationality'] = df['Nationality'].str.upper()

df

Unnamed: 0,Name,Date of Birth,Age,Nationality,Income,Contact,FirstName,LastName
0,Fernando Torres,1984-03-20,39.0,SPAIN,45403410,FT 012-345-6789,Fernando,Torres
1,Steven Gerrard,1980-05-30,43.0,UK,29339838,SG 012-345-6789,Steven,Gerrard
2,Luis Suarez,1987-01-24,37.0,URUGUAY,199396947,LS 012-345-6789,Luis,Suarez
3,Mohamed Salah,1992-06-15,31.0,EGYPT,99762487,MS 012-345-6789,Mohamed,Salah


In [14]:
# Identifying duplicate rows
print(df.duplicated())

0    False
1    False
2    False
3    False
dtype: bool


In [15]:
# Removing duplicate rows, keeping the first occurrence
df_no_duplicates = df.drop_duplicates(keep='first')
df_no_duplicates

Unnamed: 0,Name,Date of Birth,Age,Nationality,Income,Contact,FirstName,LastName
0,Fernando Torres,1984-03-20,39.0,SPAIN,45403410,FT 012-345-6789,Fernando,Torres
1,Steven Gerrard,1980-05-30,43.0,UK,29339838,SG 012-345-6789,Steven,Gerrard
2,Luis Suarez,1987-01-24,37.0,URUGUAY,199396947,LS 012-345-6789,Luis,Suarez
3,Mohamed Salah,1992-06-15,31.0,EGYPT,99762487,MS 012-345-6789,Mohamed,Salah


In [16]:
# Min-Max normalization
df['Age'] = (df['Age'] - df['Age'].min()) / (df['Age'].max() - df['Age'].min())

# Z-score normalization
df['Income'] = (df['Income'] - df['Income'].mean()) / df['Income'].std()

df

Unnamed: 0,Name,Date of Birth,Age,Nationality,Income,Contact,FirstName,LastName
0,Fernando Torres,1984-03-20,0.666667,SPAIN,-0.626145,FT 012-345-6789,Fernando,Torres
1,Steven Gerrard,1980-05-30,1.0,UK,-0.835374,SG 012-345-6789,Steven,Gerrard
2,Luis Suarez,1987-01-24,0.5,URUGUAY,1.379633,LS 012-345-6789,Luis,Suarez
3,Mohamed Salah,1992-06-15,0.0,EGYPT,0.081886,MS 012-345-6789,Mohamed,Salah


In [17]:
# Log transformation
df['Log_Age'] = np.log(df['Age'] + 1)
df

Unnamed: 0,Name,Date of Birth,Age,Nationality,Income,Contact,FirstName,LastName,Log_Age
0,Fernando Torres,1984-03-20,0.666667,SPAIN,-0.626145,FT 012-345-6789,Fernando,Torres,0.510826
1,Steven Gerrard,1980-05-30,1.0,UK,-0.835374,SG 012-345-6789,Steven,Gerrard,0.693147
2,Luis Suarez,1987-01-24,0.5,URUGUAY,1.379633,LS 012-345-6789,Luis,Suarez,0.405465
3,Mohamed Salah,1992-06-15,0.0,EGYPT,0.081886,MS 012-345-6789,Mohamed,Salah,0.0


In [18]:
Q1 = df['Income'].quantile(0.25)
Q3 = df['Income'].quantile(0.75)
IQR = Q3 - Q1
outliers = df[(df['Income'] < (Q1 - 1.5 * IQR)) | (df['Income'] > (Q3 + 1.5 * IQR))]
outliers

Unnamed: 0,Name,Date of Birth,Age,Nationality,Income,Contact,FirstName,LastName,Log_Age


In [19]:
# Capping outliers
upper_limit = Q3 + 1.5 * IQR
lower_limit = Q1 - 1.5 * IQR
df['Income'] = df['Income'].clip(lower=lower_limit, upper=upper_limit)
df

Unnamed: 0,Name,Date of Birth,Age,Nationality,Income,Contact,FirstName,LastName,Log_Age
0,Fernando Torres,1984-03-20,0.666667,SPAIN,-0.626145,FT 012-345-6789,Fernando,Torres,0.510826
1,Steven Gerrard,1980-05-30,1.0,UK,-0.835374,SG 012-345-6789,Steven,Gerrard,0.693147
2,Luis Suarez,1987-01-24,0.5,URUGUAY,1.379633,LS 012-345-6789,Luis,Suarez,0.405465
3,Mohamed Salah,1992-06-15,0.0,EGYPT,0.081886,MS 012-345-6789,Mohamed,Salah,0.0


In [20]:
# Applying square root transformation to reduce right skew
df['sqrt_Income'] = df['Income'].apply(np.sqrt)
df

Unnamed: 0,Name,Date of Birth,Age,Nationality,Income,Contact,FirstName,LastName,Log_Age,sqrt_Income
0,Fernando Torres,1984-03-20,0.666667,SPAIN,-0.626145,FT 012-345-6789,Fernando,Torres,0.510826,
1,Steven Gerrard,1980-05-30,1.0,UK,-0.835374,SG 012-345-6789,Steven,Gerrard,0.693147,
2,Luis Suarez,1987-01-24,0.5,URUGUAY,1.379633,LS 012-345-6789,Luis,Suarez,0.405465,1.174578
3,Mohamed Salah,1992-06-15,0.0,EGYPT,0.081886,MS 012-345-6789,Mohamed,Salah,0.0,0.286158


In [21]:
# One-hot encoding
encoded_df = pd.get_dummies(df, columns=['Nationality'])
encoded_df

Unnamed: 0,Name,Date of Birth,Age,Income,Contact,FirstName,LastName,Log_Age,sqrt_Income,Nationality_EGYPT,Nationality_SPAIN,Nationality_UK,Nationality_URUGUAY
0,Fernando Torres,1984-03-20,0.666667,-0.626145,FT 012-345-6789,Fernando,Torres,0.510826,,False,True,False,False
1,Steven Gerrard,1980-05-30,1.0,-0.835374,SG 012-345-6789,Steven,Gerrard,0.693147,,False,False,True,False
2,Luis Suarez,1987-01-24,0.5,1.379633,LS 012-345-6789,Luis,Suarez,0.405465,1.174578,False,False,False,True
3,Mohamed Salah,1992-06-15,0.0,0.081886,MS 012-345-6789,Mohamed,Salah,0.0,0.286158,True,False,False,False


In [22]:
# Extracting phone numbers
df['Phone_Number'] = df['Contact'].str.extract(r'(\d{3}-\d{3}-\d{4})')
df

Unnamed: 0,Name,Date of Birth,Age,Nationality,Income,Contact,FirstName,LastName,Log_Age,sqrt_Income,Phone_Number
0,Fernando Torres,1984-03-20,0.666667,SPAIN,-0.626145,FT 012-345-6789,Fernando,Torres,0.510826,,012-345-6789
1,Steven Gerrard,1980-05-30,1.0,UK,-0.835374,SG 012-345-6789,Steven,Gerrard,0.693147,,012-345-6789
2,Luis Suarez,1987-01-24,0.5,URUGUAY,1.379633,LS 012-345-6789,Luis,Suarez,0.405465,1.174578,012-345-6789
3,Mohamed Salah,1992-06-15,0.0,EGYPT,0.081886,MS 012-345-6789,Mohamed,Salah,0.0,0.286158,012-345-6789
