In [13]:
import pandas as pd 

data = {
      'Name': ['Pawan', 'Kapil', 'Lalit', 'Shubham', 'Jatin'],
      'Age': [35, None, 67, 23, None],
      'Salary': [50000, 60000, 70000, None, None]
}

df  = pd.DataFrame(data)
print("Original DataFrame")
print(df)

Original DataFrame
      Name   Age   Salary
0    Pawan  35.0  50000.0
1    Kapil   NaN  60000.0
2    Lalit  67.0  70000.0
3  Shubham  23.0      NaN
4    Jatin   NaN      NaN


In [14]:
# Handling missing data
df.isnull().sum()

Name      0
Age       2
Salary    2
dtype: int64

In [15]:
df.dropna()

Unnamed: 0,Name,Age,Salary
0,Pawan,35.0,50000.0
2,Lalit,67.0,70000.0


In [16]:
df['Age'].fillna(df['Age'].mean(),inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)
print(df)

# Find the percentage of the missing data
print(df.isnull().mean() * 100)

      Name        Age   Salary
0    Pawan  35.000000  50000.0
1    Kapil  41.666667  60000.0
2    Lalit  67.000000  70000.0
3  Shubham  23.000000  60000.0
4    Jatin  41.666667  60000.0
Name      0.0
Age       0.0
Salary    0.0
dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(), inplace=True)


In [17]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

df = pd.read_csv("sample_data.csv")

df_label = df.copy()

le = LabelEncoder()

#Encode categorical values
df_label['Gender_Encoded'] = le.fit_transform(df_label['Gender'])
df_label['Passed_Encoded'] = le.fit_transform(df_label['Passed'])

print("\n Label Encoded Data")
print(df_label[['Name', 'Gender', 'Gender_Encoded', 'Passed', 'Passed_Encoded']])


 Label Encoded Data
      Name  Gender  Gender_Encoded Passed  Passed_Encoded
0     Aman    Male               1    Yes               1
1    Priya  Female               0    Yes               1
2    Rahul    Male               1     No               0
3   Anjali  Female               0    Yes               1
4     Ravi    Male               1    Yes               1
5    Meera  Female               0     No               0
6    Arjun    Male               1    Yes               1
7     Neha  Female               0    Yes               1
8    Imran    Male               1     No               0
9    Sneha  Female               0    Yes               1
10     Raj    Male               1    Yes               1
11   Divya  Female               0     No               0
12   Kabir    Male               1    Yes               1
13  Simran  Female               0    Yes               1
14   Karan    Male               1     No               0
15   Pooja  Female               0    Yes          

In [18]:
#Encode categorical values
df_encoded = pd.get_dummies(df_label, columns=['City'], dtype=int)
print("\n One-hot Encoded Data")
print(df_encoded)


 One-hot Encoded Data
      Name  Gender Passed  Gender_Encoded  Passed_Encoded  City_Bangalore  \
0     Aman    Male    Yes               1               1               0   
1    Priya  Female    Yes               0               1               0   
2    Rahul    Male     No               1               0               1   
3   Anjali  Female    Yes               0               1               0   
4     Ravi    Male    Yes               1               1               0   
5    Meera  Female     No               0               0               0   
6    Arjun    Male    Yes               1               1               1   
7     Neha  Female    Yes               0               1               0   
8    Imran    Male     No               1               0               0   
9    Sneha  Female    Yes               0               1               0   
10     Raj    Male    Yes               1               1               0   
11   Divya  Female     No               0            

In [19]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

# Example dataset
X = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])

# Standard Scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Standard Scaled:\n", X_scaled)

# MinMax Scaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
print("\nMinMax Scaled:\n", X_scaled)

Standard Scaled:
 [[-1.22474487 -1.22474487 -1.22474487]
 [ 0.          0.          0.        ]
 [ 1.22474487  1.22474487  1.22474487]]

MinMax Scaled:
 [[0.  0.  0. ]
 [0.5 0.5 0.5]
 [1.  1.  1. ]]


In [20]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd

data = {
      'StudyHours': [1,2,3,4,5,6],
      'TestSorce': [40,50,60,70,80,90]
}

df = pd.DataFrame(data)

# Standard scaling 
Standard_scaler = StandardScaler()
Standard_scaled = Standard_scaler.fit_transform(df)

print("Standard Scaler Output")

print(pd.DataFrame(Standard_scaled, columns=['StudyHours', 'TestScore']))

# in-Max Scaling
minmax_scaler = MinMaxScaler()
minmax_scaled = minmax_scaler.fit_transform(df)
print("\n Min Max Scaler Ouput")
print(pd.DataFrame(minmax_scaled, columns=['StudyHours', 'TestScore']))

# Train-Test Split
X = df[['StudyHours']]
y = df[['TestSorce']]

X_train , X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

print("Training Data")
print(X_train)

print("Testing Data")
print(X_test)

print("Training Data")
print(y_train)

print("Testing Data")
print(y_test)

Standard Scaler Output
   StudyHours  TestScore
0    -1.46385   -1.46385
1    -0.87831   -0.87831
2    -0.29277   -0.29277
3     0.29277    0.29277
4     0.87831    0.87831
5     1.46385    1.46385

 Min Max Scaler Ouput
   StudyHours  TestScore
0         0.0        0.0
1         0.2        0.2
2         0.4        0.4
3         0.6        0.6
4         0.8        0.8
5         1.0        1.0
Training Data
   StudyHours
5           6
2           3
4           5
3           4
Testing Data
   StudyHours
0           1
1           2
Training Data
   TestSorce
5         90
2         60
4         80
3         70
Testing Data
   TestSorce
0         40
1         50


In [21]:
# Linear Regression

from sklearn.linear_model import LinearRegression

X = [[1], [2], [3], [4], [5]]
y = [30, 45, 58, 70, 90]

model = LinearRegression()

model.fit(X, y) # Train the model using data 

hours = float(input("Enter the hours you will study: "))

predicted_marks = model.predict([[hours]]) # Predict the bases of input

print(f"Based on your marks {hours} you may scored around {predicted_marks} marks.")

Enter the hours you will study:  5


Based on your marks 5.0 you may scored around [87.6] marks.


In [22]:
# Classification -> Logistic Regression

from sklearn.linear_model import LogisticRegression
X = [[1], [2], [3], [4], [5]] # hour stody input
y = [0, 0, 1, 1, 1] # result 0F, 1P

model = LogisticRegression()

model.fit(X,y)

hours = float(input("Enter the hours you will study: "))

predicted_result = model.predict([[hours]])[0]

if predicted_result == 1:
      print(f"Based on hours {hours}, you are likely to PASS")
else:
      print(f"Based on hours {hours}, you are likely to FAIL!!")

Enter the hours you will study:  4


Based on hours 4.0, you are likely to PASS


In [23]:
# KNN(K - Nearest Neighbours)

from sklearn.neighbors import KNeighborsClassifier

X = [
      [180, 7],
      [200, 7.5],
      [250, 8],
      [300, 8.5],
      [330, 9],
      [360, 9.5]
]

y = [0,0,0,1,1,1]

model = KNeighborsClassifier(n_neighbors = 3)

model.fit(X, y)

weight = float(input("Enter the weight in gram: "))
size = float(input("Enter the size in cm: "))

prediction = model.predict([[weight, size]])[0] # 0 for the print the content inside the list 

if prediction == 0:
      print("This is likely an Apple")
else:
      print("This is likely an Orange")

Enter the weight in gram:  3
Enter the size in cm:  3


This is likely an Apple


In [24]:
# Decision Trees
from sklearn.tree import DecisionTreeClassifier

X = [
      [7, 2],
      [8, 3],
      [9, 8],
      [10, 9]
]

y = [0, 0, 1, 1]

model = DecisionTreeClassifier()

model.fit(X, y)

size = float(input("Enter the size of fruit in cm: "))
shade = float(input("Enter the shade of fruit (1-10): "))

result = model.predict([[size, shade]])[0]

if result == 0:
      print("This is likely an Apple")
else:
      print("This is likely an Orange")

Enter the size of fruit in cm:  4
Enter the shade of fruit (1-10):  7


This is likely an Apple


In [25]:
# Mastering Model Evaluation 

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# True answer (what actually happend)
y_true = [1, 0, 1, 1, 0, 1, 0]

# Model's prediction (what it guessed)
y_predit = [1, 0, 1, 0, 0, 1, 1]

# Evaluation 
print("Accuracy: ", accuracy_score(y_true, y_predit))
print("Precision: ", precision_score(y_true, y_predit))
print("Recall: ", recall_score(y_true, y_predit))
print("F1-score: ", f1_score(y_true, y_predit))

Accuracy:  0.7142857142857143
Precision:  0.75
Recall:  0.75
F1-score:  0.75


In [26]:
#Confusion Matrix

from sklearn.metrics import confusion_matrix

y_true = [1, 0, 1, 1, 0, 1, 0, 0, 1, 0]
y_predit = [1, 0, 1, 0, 0, 1, 1, 0, 1, 0]

cm = confusion_matrix(y_true,  y_predit)

print("Confusion Matrix") 
print(cm)                # 4 - TN | 1 - FP | 4 - TP | 1 - FN

Confusion Matrix
[[4 1]
 [1 4]]


In [27]:
# Mean Absolute Error(MAE)
"""
1 - Take the Mistake difference 
2 - Remove the minus sign 
3 - add 
4 - divide
"""
from sklearn.metrics import mean_absolute_error as mae
 
actual = [2,3,5,5,9]
calculated = [3,3,8,7,6]

error = mae(actual, calculated)

print("Mean Absolute Error: ", error)

Mean Absolute Error:  1.8


In [29]:
# Mean Squared Error (MSE)
"""
1 - Mistakes square them
2 - add
3 - divide by total value """

from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# real score
real_score = [90, 60, 80, 100]

# model guess
predicted_score = [85, 70, 70, 95]

mae = mean_absolute_error(real_score, predicted_score)

mse = mean_squared_error(real_score, predicted_score)

rmse = np.sqrt(mse)

print("MAE: On Average of by: ", mae)
print("MSE: Squared mistake values: ", mse)
print("RMSE: Final Realistic error: ", rmse)

MAE: On Average of by:  7.5
MSE: Squared mistake values:  62.5
RMSE: Final Realistic error:  7.905694150420948
