In [1]:
#DATAANALYSIS PROJECTS

In [2]:
#USE PYTHON WITH PANDAS AND NUMPY
#PERFORM FEATURE ENGINEERING AS PART OF DATA PROCESSING

In [3]:
#IMPRTING PANDAS AS pd AND NUMPY AS np
import pandas as pd
import numpy as np


In [4]:
# Create a "messy" movie dataset
data = {
    'Title': ['Action Hero', 'Love in Paris', 'Space Wars', 'The Joke', 'Mystery Island', 'Silent Hill'],
    'Release_Date': ['2023-05-12', '2023-02-14', '2023-12-15', '2023-07-04', '2023-10-31', np.nan],
    'Genre': ['Action', 'Romance', 'Sci-Fi|Action', 'Comedy', 'Mystery', 'Horror'],
    'Runtime_Min': [120, 95, 150, 85, np.nan, 110],
    'Budget_M': [100, 20, 250, 10, 50, 15]
}

df = pd.DataFrame(data)

In [5]:
df

Unnamed: 0,Title,Release_Date,Genre,Runtime_Min,Budget_M
0,Action Hero,2023-05-12,Action,120.0,100
1,Love in Paris,2023-02-14,Romance,95.0,20
2,Space Wars,2023-12-15,Sci-Fi|Action,150.0,250
3,The Joke,2023-07-04,Comedy,85.0,10
4,Mystery Island,2023-10-31,Mystery,,50
5,Silent Hill,,Horror,110.0,15


In [6]:
#Feature Engineering

In [7]:
#1. Date Extraction

In [8]:
#We convert the string dates into datetime objects to extract the Month and Day of Week. 
#This helps the model learn about "Blockbuster seasons"

In [9]:
df['Release_Date'] = pd.to_datetime(df['Release_Date'])

# Extracting Month (1-12) and Day of Week (0=Mon, 6=Sun)
df['Release_Month'] = df['Release_Date'].dt.month
df['Release_Day'] = df['Release_Date'].dt.dayofweek

# Fill missing dates with the most common month (Mode)
df['Release_Month'] = df['Release_Month'].fillna(df['Release_Month'].mode()[0])

In [20]:
df

Unnamed: 0,Title,Release_Date,Genre,Runtime_Min,Budget_M,Release_Month,Release_Day
0,Action Hero,2023-05-12,Action,120.0,100,5.0,4.0
1,Love in Paris,2023-02-14,Romance,95.0,20,2.0,1.0
2,Space Wars,2023-12-15,Sci-Fi|Action,150.0,250,12.0,4.0
3,The Joke,2023-07-04,Comedy,85.0,10,7.0,1.0
4,Mystery Island,2023-10-31,Mystery,110.0,50,10.0,1.0
5,Silent Hill,NaT,Horror,110.0,15,2.0,


In [11]:
#2. Runtime Categorization (Binning)

In [12]:
#runtimes into "Short", "Standard", and "Long" using np.select.

In [25]:
# Handle missing runtime first
df['Runtime_Min'] = df['Runtime_Min'].fillna(df['Runtime_Min'].median())

conditions = [
    (df['Runtime_Min'] < 100),
    (df['Runtime_Min'] >= 100) & (df['Runtime_Min'] <= 130),
    (df['Runtime_Min'] > 130)
]
labels = ['Short', 'Standard', 'Epic']

df= np.select(conditions, labels)

TypeError: Choicelist and default value do not have a common dtype: The DType <class 'numpy.dtypes._PyLongDType'> could not be promoted by <class 'numpy.dtypes.StrDType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.StrDType'>, <class 'numpy.dtypes.StrDType'>, <class 'numpy.dtypes.StrDType'>, <class 'numpy.dtypes._PyLongDType'>)

In [19]:
df

Unnamed: 0,Title,Release_Date,Genre,Runtime_Min,Budget_M,Release_Month,Release_Day
0,Action Hero,2023-05-12,Action,120.0,100,5.0,4.0
1,Love in Paris,2023-02-14,Romance,95.0,20,2.0,1.0
2,Space Wars,2023-12-15,Sci-Fi|Action,150.0,250,12.0,4.0
3,The Joke,2023-07-04,Comedy,85.0,10,7.0,1.0
4,Mystery Island,2023-10-31,Mystery,110.0,50,10.0,1.0
5,Silent Hill,NaT,Horror,110.0,15,2.0,


In [26]:
#3. Handling Multi-Genre Strings

#Notice that "Space Wars" is Sci-Fi|Action. We can use .str.get_dummies() to create a column for every genre found in the data.

In [27]:
# This creates a 1 or 0 for every unique genre found
genre_dummies = df['Genre'].str.get_dummies(sep='|')
df = pd.concat([df, genre_dummies], axis=1)

In [28]:
df

Unnamed: 0,Title,Release_Date,Genre,Runtime_Min,Budget_M,Release_Month,Release_Day,Action,Comedy,Horror,Mystery,Romance,Sci-Fi
0,Action Hero,2023-05-12,Action,120.0,100,5.0,4.0,1,0,0,0,0,0
1,Love in Paris,2023-02-14,Romance,95.0,20,2.0,1.0,0,0,0,0,1,0
2,Space Wars,2023-12-15,Sci-Fi|Action,150.0,250,12.0,4.0,1,0,0,0,0,1
3,The Joke,2023-07-04,Comedy,85.0,10,7.0,1.0,0,1,0,0,0,0
4,Mystery Island,2023-10-31,Mystery,110.0,50,10.0,1.0,0,0,0,1,0,0
5,Silent Hill,NaT,Horror,110.0,15,2.0,,0,0,1,0,0,0
