# Feature Engineered Regression
This notebook will be feature engineered Regression Model. This will be a building block to moving to a Neural Network for ROI predictions

This is Regression #3 in our 5 part regression series



### Import All Requirements

In [128]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler

### 1. Read the CSV file into a pandas dataframe

In [129]:
df = pd.read_csv("../data/fixed_target_encoding/Encoded_Data.csv")
df.dropna(inplace=True)

# Uncomment the following lines to checkout the first few rows of the DataFrame and learn more about its structure
print(df.head())
print(df.columns)

    tmdb_id  adult  belongs_to_collection  collection_id   budget  \
4       618  False                   True       982426.0   100000   
48    65203  False                   True      1037405.0   379000   
52      143  False                   True      1070848.0  1448864   
54   156320  False                   True       981517.0   337000   
59      138  False                   True       221539.0   355000   

                                             overview  production_company_id  \
4   Two families, abolitionist Northerners the Sto...               161894.0   
48  The vaudeville act of Harriet and Queenie Maho...                   21.0   
52  When a group of idealistic young men join the ...                   33.0   
54  Three department store girls--Connie, Franky, ...                   21.0   
59  British estate agent Renfield travels to Trans...                   33.0   

   production_company_name  release_month  release_year  ...  Action  \
4           Epoch Film Co.      

In [130]:
print(df.columns)

Index(['tmdb_id', 'adult', 'belongs_to_collection', 'collection_id', 'budget',
       'overview', 'production_company_id', 'production_company_name',
       'release_month', 'release_year', 'revenue', 'tagline', 'director',
       'actor_0_name', 'actor_0_id', 'actor_0_gender', 'actor_1_name',
       'actor_1_id', 'actor_1_gender', 'actor_2_name', 'actor_2_id',
       'actor_2_gender', 'actor_3_name', 'actor_3_id', 'actor_3_gender',
       'actor_4_name', 'actor_4_id', 'actor_4_gender', 'actor_birthdays',
       'Gender Ratio', 'Average Actor Age', 'runtime', 'Crime', 'Drama', 'War',
       'Western', 'Family', 'Thriller', 'Fantasy', 'Mystery', 'Animation',
       'Music', 'Romance', 'Adventure', 'History', 'Science Fiction', 'Comedy',
       'TV Movie', 'Documentary', 'Horror', 'Action', 'actor0_rev',
       'actor0_movies', 'actor1_rev', 'actor1_movies', 'actor2_rev',
       'actor2_movies', 'actor_0_name_encoded', 'actor_1_name_encoded',
       'actor_2_name_encoded'],
      dtype='

### 2. Setup the Dataset

In [131]:
# Select the features (X) and the target variable (y)
# features = ['belongs_to_collection', 'budget', 'release_month', 'release_year', 'Gender Ratio', 'Average Actor Age', 'runtime', 'Crime', 'Drama', 'War',
#        'Western', 'Family', 'Thriller', 'Fantasy', 'Mystery', 'Animation',
#        'Music', 'Romance', 'Adventure', 'History', 'Science Fiction', 'Comedy',
#        'TV Movie', 'Documentary', 'Horror', 'Action', 'actor_0_name_encoded', 'actor_1_name_encoded',
#        'actor_2_name_encoded']

features = ['belongs_to_collection', 'budget', 'release_month', 'release_year', 'Gender Ratio', 'Average Actor Age', 'runtime', 'actor_0_name_encoded', 'actor_1_name_encoded',
        'actor_2_name_encoded']

# features = ['belongs_to_collection', 'release_month', 'release_year', 'Gender Ratio', 'Average Actor Age']


df['release_month_sin'] = np.sin(2 * np.pi * df['release_month'] / 12)
df['release_month_cos'] = np.cos(2 * np.pi * df['release_month'] / 12)

# Select features and target variable
features = ['belongs_to_collection', 'budget', 'release_year', 'Gender Ratio', 
            'Average Actor Age', 'runtime', 'actor_0_name_encoded', 
            'actor_1_name_encoded', 'actor_2_name_encoded', 'release_month_sin', 'release_month_cos']

X = df[features]  # Assuming 'budget' is the independent variable
y = df['revenue']   # Assuming 'revenue' is the dependent variable


# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### 3. Create and Fit the Model

In [132]:
# Create and fit the model
model = LinearRegression()
model.fit(X_train, y_train)

### 4. Make Predictions

In [133]:
# Make predictions
y_pred = model.predict(X_test)

### 6. Some Results

In [134]:
# Print the coefficients
feature_names = X_train.columns

# Print each feature and its corresponding weight
for feature, weight in zip(feature_names, model.coef_):
    print(f"{feature}: {weight}")

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error:', mae)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

# Calculate R-squared (R2) score
r2 = r2_score(y_test, y_pred)
print('R-squared (R2) Score:', r2)

belongs_to_collection: 0.0
budget: 3.214322106288739
release_year: -723524.6170027844
Gender Ratio: 20941730.058218025
Average Actor Age: -192405.8100943877
runtime: 951885.6570830598
actor_0_name_encoded: 0.3072922204402372
actor_1_name_encoded: 0.03025250317283175
actor_2_name_encoded: 0.19301329194338876
release_month_sin: 7113955.896594564
release_month_cos: -3523012.5905361474
Mean Absolute Error: 81558311.42382354
Mean Squared Error: 2.2824700502284256e+16
R-squared (R2) Score: 0.6696466527097154
