## 1. Import Libraries

In [228]:
import pandas as pd

import numpy as np

import os

from sklearn.model_selection import train_test_split

## 2. Import Dataset

In [229]:
file_path=r"C:\Users\Debasish Das\Desktop\Movie_Genre\Genre Classification Dataset"
def get_data(name):
    file_name=f"{name}.txt"
    full_path=os.path.join(file_path,file_name)
    return pd.read_table(full_path,header=None)


In [230]:
train=get_data("train")
train

Unnamed: 0,0
0,1 ::: Oscar et la dame rose (2009) ::: drama :...
1,2 ::: Cupid (1997) ::: thriller ::: A brother ...
2,"3 ::: Young, Wild and Wonderful (1980) ::: adu..."
3,4 ::: The Secret Sin (1915) ::: drama ::: To h...
4,5 ::: The Unrecovered (2007) ::: drama ::: The...
...,...
54209,"54210 ::: ""Bonino"" (1953) ::: comedy ::: This ..."
54210,54211 ::: Dead Girls Don't Cry (????) ::: horr...
54211,54212 ::: Ronald Goedemondt: Ze bestaan echt (...
54212,54213 ::: Make Your Own Bed (1944) ::: comedy ...


In [231]:
test=get_data("test")
test

Unnamed: 0,0
0,1 ::: Edgar's Lunch (1998) ::: thriller ::: L....
1,2 ::: La guerra de papá (1977) ::: comedy ::: ...
2,3 ::: Off the Beaten Track (2010) ::: document...
3,4 ::: Meu Amigo Hindu (2015) ::: drama ::: His...
4,5 ::: Er nu zhai (1955) ::: drama ::: Before h...
...,...
54195,"54196 ::: ""Tales of Light & Dark"" (2013) ::: h..."
54196,54197 ::: Der letzte Mohikaner (1965) ::: west...
54197,54198 ::: Oliver Twink (2007) ::: adult ::: A ...
54198,54199 ::: Slipstream (1973) ::: drama ::: Popu...


- No columns
- Row data are marge in one one paragraph
- Different data split by `:::`

### 2.1 Marge Dataframe

In [232]:
genre=pd.concat([train,test])
genre

Unnamed: 0,0
0,1 ::: Oscar et la dame rose (2009) ::: drama :...
1,2 ::: Cupid (1997) ::: thriller ::: A brother ...
2,"3 ::: Young, Wild and Wonderful (1980) ::: adu..."
3,4 ::: The Secret Sin (1915) ::: drama ::: To h...
4,5 ::: The Unrecovered (2007) ::: drama ::: The...
...,...
54195,"54196 ::: ""Tales of Light & Dark"" (2013) ::: h..."
54196,54197 ::: Der letzte Mohikaner (1965) ::: west...
54197,54198 ::: Oliver Twink (2007) ::: adult ::: A ...
54198,54199 ::: Slipstream (1973) ::: drama ::: Popu...


### 2.2. Column 

In [233]:
genre[["movie_id","title","genre","description"]]=genre[0].str.split(":::",expand=True)
genre.drop(columns=0,inplace=True)


## 3.  Preliminary Analysis

### 3.1 Data Types

In [234]:
genre.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108414 entries, 0 to 54199
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   movie_id     108414 non-null  object
 1   title        108414 non-null  object
 2   genre        108414 non-null  object
 3   description  108414 non-null  object
dtypes: object(4)
memory usage: 4.1+ MB


- `movie_id` must be in int format

### 3.2 Check Duplicates

In [235]:
train.duplicated().sum()

np.int64(0)

## 4. Detailed Analysis Of every column

### 4.1. movie_id

In [236]:
genre.movie_id.unique().shape

(54214,)

- `movie_id` not useful column

### 4.2 title

In [237]:
genre.title

0             Oscar et la dame rose (2009) 
1                             Cupid (1997) 
2         Young, Wild and Wonderful (1980) 
3                    The Secret Sin (1915) 
4                   The Unrecovered (2007) 
                        ...                
54195       "Tales of Light & Dark" (2013) 
54196          Der letzte Mohikaner (1965) 
54197                  Oliver Twink (2007) 
54198                    Slipstream (1973) 
54199            Curitiba Zero Grau (2010) 
Name: title, Length: 108414, dtype: object

- we have to make another column `year` 

In [238]:
genre.assign(
    title=lambda x: x["title"].str.split("(").str[0].str.strip()
)

Unnamed: 0,movie_id,title,genre,description
0,1,Oscar et la dame rose,drama,Listening in to a conversation between his do...
1,2,Cupid,thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful",adult,As the bus empties the students for their fie...
3,4,The Secret Sin,drama,To help their unemployed father make ends mee...
4,5,The Unrecovered,drama,The film's title refers not only to the un-re...
...,...,...,...,...
54195,54196,"""Tales of Light & Dark""",horror,"Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner,western,As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink,adult,A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream,drama,"Popular, but mysterious rock D.J Mike Mallard..."


In [239]:
genre.assign(
    year=lambda x: x["title"].str.split("(").str[1].str.strip(" )/IVSR").apply(lambda y: int(y) if y.isnumeric() else 0)
    #(" ?/IVSRworkingtitleAstroTurf Wa)").replace(['"',''],0).str.split(")").str[0].fillna(0).astype(int)
)


Unnamed: 0,movie_id,title,genre,description,year
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,2009
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...,1997
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,1980
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,1915
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,2007
...,...,...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Da...",2013
54196,54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their...,1965
54197,54198,Oliver Twink (2007),adult,A movie 169 years in the making. Oliver Twist...,2007
54198,54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard...",1973


### 4.3 genre

In [240]:
genre["genre"].value_counts()

genre
drama           27225
documentary     26192
comedy          14893
short           10145
horror           4408
thriller         3181
action           2629
western          2064
reality-tv       1767
family           1567
adventure        1550
music            1462
romance          1344
sci-fi           1293
adult            1180
crime            1010
animation         996
sport             863
talk-show         782
fantasy           645
mystery           637
musical           553
biography         529
history           486
game-show         387
news              362
war               264
Name: count, dtype: int64

In [241]:
genre["genre"].describe(include="O")

count      108414
unique         27
top        drama 
freq        27225
Name: genre, dtype: object

- Imbalenced Data 

### 4.4. Description

In [242]:
genre.description

0         Listening in to a conversation between his do...
1         A brother and sister with a past incestuous r...
2         As the bus empties the students for their fie...
3         To help their unemployed father make ends mee...
4         The film's title refers not only to the un-re...
                               ...                        
54195     Covering multiple genres, Tales of Light & Da...
54196     As Alice and Cora Munro attempt to find their...
54197     A movie 169 years in the making. Oliver Twist...
54198     Popular, but mysterious rock D.J Mike Mallard...
54199     Curitiba is a city in movement, with rhythms ...
Name: description, Length: 108414, dtype: object

## 5. Data Cleaning 

In [243]:
def clean_data(df):
    # Ensure the title column is in string format
    df["title"] = df["title"].astype(str)
    return (
         df
         .drop(columns=["movie_id"])
         .assign(
        title_cleaned=lambda x: x["title"].str.split("(").str[0].str.strip(),
        year=lambda x: x["title"].str.split("(").str[1].str.split(")").str[0].apply(lambda y: int(y) if y.isnumeric() else 0)
    )
         .drop(columns=["title"])
         .rename(columns={"title_cleaned": "title"})
    )

In [244]:
final_text=clean_data(genre)

In [245]:
final_text.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108414 entries, 0 to 54199
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   genre        108414 non-null  object
 1   description  108414 non-null  object
 2   title        108414 non-null  object
 3   year         108414 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 4.1+ MB


## 6. Split The Data

In [246]:
x=final_text.drop(columns=["genre"])
y=final_text["genre"]

In [247]:
print(x.shape)
print(y.shape)

(108414, 3)
(108414,)


In [248]:
x_,x_test,y_,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
x_train,x_val,y_train,y_val=train_test_split(x_,y_,test_size=0.2,random_state=42)

print(x_.shape)
print(y_.shape)
print(x_test.shape)
print(y_test.shape)
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

(86731, 3)
(86731,)
(21683, 3)
(21683,)
(69384, 3)
(69384,)
(17347, 3)
(17347,)


## 7.Export Data

In [249]:
def export(x, y, name):
    file_name = f"{name}.csv"
    full_path = os.path.join(file_path, file_name)
    combined_df = pd.concat([x, y], axis=1)
    combined_df.to_csv(full_path, index=False)
    return pd.read_csv(full_path).info()

In [250]:
export(x_train,y_train,"train-c")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69384 entries, 0 to 69383
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  69384 non-null  object
 1   title        69369 non-null  object
 2   year         69384 non-null  int64 
 3   genre        69384 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.1+ MB


In [251]:
export(x_test,y_test,"test-c")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21683 entries, 0 to 21682
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  21683 non-null  object
 1   title        21680 non-null  object
 2   year         21683 non-null  int64 
 3   genre        21683 non-null  object
dtypes: int64(1), object(3)
memory usage: 677.7+ KB


In [252]:
export(x_val,y_val,"val-c")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17347 entries, 0 to 17346
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  17347 non-null  object
 1   title        17346 non-null  object
 2   year         17347 non-null  int64 
 3   genre        17347 non-null  object
dtypes: int64(1), object(3)
memory usage: 542.2+ KB
