# Regression w/o Budget
This notebook will is a test to see how our regression performs w/o a budget. This will be a building block to moving to a Neural Network for ROI predictions


### Import All Requirements

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### 1. Read the CSV file into a pandas dataframe

In [2]:
data = pd.read_csv("../data/super_regression/filtered_data.csv")

# Uncomment the following lines to checkout the first few rows of the DataFrame and learn more about its structure
print(data.head())
print(data.columns)

   tmdb_id  adult  belongs_to_collection  collection_id     budget  \
0       13  False                  False            NaN   55000000   
1   216015  False                   True       344830.0   40000000   
2      621  False                   True        86083.0    6000000   
3   800939  False                  False            NaN   20000000   
4    64682  False                  False            NaN  105000000   

                             genres  \
0    ['Comedy', 'Drama', 'Romance']   
1  ['Drama', 'Romance', 'Thriller']   
2             ['Romance', 'Comedy']   
3    ['Drama', 'Comedy', 'Romance']   
4              ['Drama', 'Romance']   

                                            overview  production_company_id  \
0  A man with a low IQ has accomplished great thi...                    4.0   
1  When college senior Anastasia Steele steps in ...                10146.0   
2  Australian good girl Sandy and greaser Danny f...                    4.0   
3  Divorced couple Georgia a

### 2. Pre-Process the Data
- One Hot Encoding for Genres
- Target Encoding for Production Company Names
- Target Encoding for Actor Names and Director Names

In [3]:
# Preprocessing
# One-hot encoding genres
genres = pd.get_dummies(data['genres'].apply(eval).apply(pd.Series), prefix='', prefix_sep='').sum(axis=0)



# Target encoding production_company_name
target_encoder = TargetEncoder()
data['production_company_name_encoded'] = target_encoder.fit_transform(data['production_company_name'], data['revenue'])

# Target encoding actor names and director name
for column in ['actor_0_name', 'actor_1_name', 'actor_2_name', 'director']:
    target_encoder = TargetEncoder()
    data[f'{column}_encoded'] = target_encoder.fit_transform(data[column], data['revenue'])


### 3. Select the relevant features

In [22]:
print(data.columns)
# Remove budget for no budget
features = ['budget', 'belongs_to_collection']
# Uncomment the Line Below and comment the line above to remove budget
# features = []
genres_df = pd.DataFrame(genres)
# features.extend(genres_df.columns)
features.extend(['release_month', 'release_year', 'production_company_name_encoded'])
features.extend(['actor_0_name_encoded', 'actor_1_name_encoded', 'actor_2_name_encoded', 'director_encoded'])
features.extend(['Gender Ratio', 'Average Actor Age'])

# Assuming 'filtered_data.csv' as the desired filename for the CSV file
csv_filename = 'target_encoded_data.csv'

# Convert the filtered DataFrame back to a CSV file
data.to_csv(csv_filename, index=False)


print(features)



Index(['tmdb_id', 'adult', 'belongs_to_collection', 'collection_id', 'budget',
       'genres', 'overview', 'production_company_id',
       'production_company_name', 'release_month', 'release_year', 'revenue',
       'tagline', 'director', 'actor_0_name', 'actor_0_id', 'actor_0_gender',
       'actor_1_name', 'actor_1_id', 'actor_1_gender', 'actor_2_name',
       'actor_2_id', 'actor_2_gender', 'actor_3_name', 'actor_3_id',
       'actor_3_gender', 'actor_4_name', 'actor_4_id', 'actor_4_gender',
       'actor_birthdays', 'Gender Ratio', 'Average Actor Age',
       'production_company_name_encoded', 'actor_0_name_encoded',
       'actor_1_name_encoded', 'actor_2_name_encoded', 'director_encoded'],
      dtype='object')
['budget', 'belongs_to_collection', 'release_month', 'release_year', 'production_company_name_encoded', 'actor_0_name_encoded', 'actor_1_name_encoded', 'actor_2_name_encoded', 'director_encoded', 'Gender Ratio', 'Average Actor Age']


### 3.5 Trying to scale the feautres

In [14]:
# data = pd.read_csv('target_encoded_data.csv')

# # Select the features for scaling
# features_to_scale = ['budget', 'release_month', 'release_year', 'production_company_name_encoded',
#                      'actor_0_name_encoded', 'actor_1_name_encoded', 'actor_2_name_encoded', 'director_encoded',
#                      'Gender Ratio', 'Average Actor Age']

# # Extract the selected features
# selected_features = data[features_to_scale]

# # Initialize the MinMaxScaler
# scaler = MinMaxScaler()

# # Fit and transform the selected features
# scaled_features = scaler.fit_transform(selected_features)

# # Create a DataFrame for the scaled features
# scaled_features_df = pd.DataFrame(scaled_features, columns=features_to_scale)

# # Replace the original features with the scaled features in the data DataFrame
# data[features_to_scale] = scaled_features_df

# # Save the scaled data to a new CSV file
# scaled_csv_filename = 'scaled_target_encoded_data.csv'
# data.to_csv(scaled_csv_filename, index=False)

# # Print the columns of the DataFrame
# print(data.columns)


### 4. Setup the Train-Test Split

In [23]:
# Create an imputer object with strategy 'mean', 'median', or 'most_frequent'
print(features)
X = data[features]
y = data['revenue']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


imputer = SimpleImputer(strategy='mean')

# Fit the imputer to the training data
imputer.fit(X_train)

# Transform both the training and testing data
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)


['budget', 'belongs_to_collection', 'release_month', 'release_year', 'production_company_name_encoded', 'actor_0_name_encoded', 'actor_1_name_encoded', 'actor_2_name_encoded', 'director_encoded', 'Gender Ratio', 'Average Actor Age']


### 5. Train Linear Regression Model

In [24]:
model = LinearRegression()
model.fit(X_train_imputed, y_train)

### 6. Make Predictions

In [25]:
y_pred = model.predict(X_test_imputed)

### 7. Evaluate Model and Print Metrics

In [26]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R-squared (R2) Score:", r2)

Mean Absolute Error: 46216344.620485656
Mean Squared Error: 7543911161448843.0
R-squared (R2) Score: 0.730117394706653


### 8. Print Coefficients

In [28]:
# Print coefficients
coefficients = model.coef_
intercept = model.intercept_

# List of feature names
feature_names = ['budget', 'belongs_to_collection', 'release_month', 'release_year', 'production_company_name_encoded', 
                 'actor_0_name_encoded', 'actor_1_name_encoded', 'actor_2_name_encoded', 
                 'director_encoded', 'Gender Ratio', 'Average Actor Age']

# Print coefficients along with corresponding feature names
print("Feature Coefficients:")
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef}")

# Print the intercept
print("Intercept:", intercept)


Feature Coefficients:
budget: 1.0900419065909364
belongs_to_collection: 38668097.95435081
release_month: -480159.4585717099
release_year: -301355.90546068427
production_company_name_encoded: 0.2844000919043758
actor_0_name_encoded: 0.4589713886075584
actor_1_name_encoded: 1.3625938063818932
actor_2_name_encoded: 2.6120369337322686
director_encoded: 0.44482266400026105
Gender Ratio: 4157707.370988705
Average Actor Age: -330936.3328224505
Intercept: 264619553.60595745
