In [40]:
# Import libraries
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification
from collections import Counter

In [41]:
df=pd.read_csv("my_movies.csv")
print(df)

              V1             V2             V3          V4     V5  \
0    Sixth Sense          LOTR1  Harry Potter1  Green Mile  LOTR2   
1      Gladiator        Patriot     Braveheart         NaN    NaN   
2          LOTR1          LOTR2            NaN         NaN    NaN   
3      Gladiator        Patriot    Sixth Sense         NaN    NaN   
4      Gladiator        Patriot    Sixth Sense         NaN    NaN   
5      Gladiator        Patriot    Sixth Sense         NaN    NaN   
6  Harry Potter1  Harry Potter2            NaN         NaN    NaN   
7      Gladiator        Patriot            NaN         NaN    NaN   
8      Gladiator        Patriot    Sixth Sense         NaN    NaN   
9    Sixth Sense           LOTR      Gladiator  Green Mile    NaN   

   Sixth Sense  Gladiator  LOTR1  Harry Potter1  Patriot  LOTR2  \
0            1          0      1              1        0      1   
1            0          1      0              0        1      0   
2            0          0      1       

In [42]:
print(df.columns)


Index(['V1', 'V2', 'V3', 'V4', 'V5', 'Sixth Sense', 'Gladiator', 'LOTR1',
       'Harry Potter1', 'Patriot', 'LOTR2', 'Harry Potter2', 'LOTR',
       'Braveheart', 'Green Mile'],
      dtype='object')


In [43]:
df

Unnamed: 0,V1,V2,V3,V4,V5,Sixth Sense,Gladiator,LOTR1,Harry Potter1,Patriot,LOTR2,Harry Potter2,LOTR,Braveheart,Green Mile
0,Sixth Sense,LOTR1,Harry Potter1,Green Mile,LOTR2,1,0,1,1,0,1,0,0,0,1
1,Gladiator,Patriot,Braveheart,,,0,1,0,0,1,0,0,0,1,0
2,LOTR1,LOTR2,,,,0,0,1,0,0,1,0,0,0,0
3,Gladiator,Patriot,Sixth Sense,,,1,1,0,0,1,0,0,0,0,0
4,Gladiator,Patriot,Sixth Sense,,,1,1,0,0,1,0,0,0,0,0
5,Gladiator,Patriot,Sixth Sense,,,1,1,0,0,1,0,0,0,0,0
6,Harry Potter1,Harry Potter2,,,,0,0,0,1,0,0,1,0,0,0
7,Gladiator,Patriot,,,,0,1,0,0,1,0,0,0,0,0
8,Gladiator,Patriot,Sixth Sense,,,1,1,0,0,1,0,0,0,0,0
9,Sixth Sense,LOTR,Gladiator,Green Mile,,1,1,0,0,0,0,0,1,0,1


In [44]:
df.dtypes

V1               object
V2               object
V3               object
V4               object
V5               object
Sixth Sense       int64
Gladiator         int64
LOTR1             int64
Harry Potter1     int64
Patriot           int64
LOTR2             int64
Harry Potter2     int64
LOTR              int64
Braveheart        int64
Green Mile        int64
dtype: object

In [45]:
df.describe()

Unnamed: 0,Sixth Sense,Gladiator,LOTR1,Harry Potter1,Patriot,LOTR2,Harry Potter2,LOTR,Braveheart,Green Mile
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.6,0.7,0.2,0.2,0.6,0.2,0.1,0.1,0.1,0.2
std,0.516398,0.483046,0.421637,0.421637,0.516398,0.421637,0.316228,0.316228,0.316228,0.421637
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
# Step 1: Create an imbalanced dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.9, 0.1], n_samples=1000, random_state=42)

In [47]:
X

array([[-0.6693561 , -0.16949709, -0.87076638, ..., -1.26733697,
         0.60271356,  1.01664321],
       [ 0.09337237,  1.63325827,  0.10575379, ..., -0.12270893,
         1.42291559,  0.91136272],
       [-0.90579721,  1.88316675,  0.29514098, ...,  0.83049813,
         1.68353405, -0.5782121 ],
       ...,
       [-0.20013455, -2.30849169,  1.79701652, ..., -1.50280171,
        -2.00422224,  1.60111869],
       [ 0.03935575,  0.2772556 , -0.47532342, ...,  0.09912579,
         0.8836702 ,  1.20827474],
       [ 0.76921528,  0.49933738,  0.16994471, ...,  0.6561162 ,
         0.98430978, -2.02100232]])

In [48]:
# Check original class distribution
print(f"Original class distribution: {Counter(y)}")

Original class distribution: Counter({0: 897, 1: 103})


In [49]:
# Step 2: Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [50]:
# Step 3: Apply SMOTE to oversample the minority class
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [51]:
# Check the class distribution after SMOTE
print(f"Resampled class distribution: {Counter(y_resampled)}")

Resampled class distribution: Counter({1: 627, 0: 627})


In [52]:
# Step 4: Train a model (e.g., RandomForestClassifier) on the resampled data
model = RandomForestClassifier(random_state=42)
model.fit(X_resampled, y_resampled)

In [53]:
# Step 5: Predict and evaluate the model
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       270
           1       0.93      0.87      0.90        30

    accuracy                           0.98       300
   macro avg       0.96      0.93      0.94       300
weighted avg       0.98      0.98      0.98       300

