In [26]:
import pandas as pd
import statsmodels.api as sm
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [27]:
# Read the regression dataset from the project's data directory into a DataFrame
data = pd.read_csv(filepath_or_buffer='../data/Final_Regression.csv')

In [28]:
# Predictor columns used by the regression, listed one per line so that
# individual columns are easy to add or remove.
features = [
    'budget',
    'release_month',
    'release_year',
    'belongs_to_collection',
    'actor_0_name_encoded',
    'actor_1_name_encoded',
    'actor_2_name_encoded',
    'Gender Ratio',
    'Average Actor Age',
    'runtime',
    'production_company_name_encoded',
    'director_encoded',
]

# Optional genre columns -- uncomment to append them to the predictors.
# features += ['Crime', 'Drama', 'War', 'Western', 'Family', 'Thriller',
#              'Fantasy', 'Mystery', 'Animation', 'Music', 'Romance',
#              'Adventure', 'History', 'Science Fiction', 'Comedy',
#              'TV Movie', 'Documentary', 'Horror', 'Action']

# Column the model is trained to predict
target = 'revenue'

# Second name bound to the same list object as `features`
feature_names = features

In [29]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data[features],
    data[target],
    test_size=0.2,
    random_state=42,
)

In [30]:
# Report how many values are missing in each training column
missing_values = X_train.isnull().sum()
print("Missing values in training data:")
print(missing_values)

# Fill missing entries with the per-column mean.
# Other available strategies include 'median' and 'most_frequent'.
# NOTE(review): a mean fill yields fractional values for columns such as
# release_month / release_year -- confirm that is acceptable for this model.
imputer = SimpleImputer(strategy='mean')

# Learn the fill values from the training split only, so the held-out rows
# play no part in computing them.
imputer.fit(X_train)

Missing values in training data:
budget                             0
release_month                      6
release_year                       6
belongs_to_collection              0
actor_0_name_encoded               0
actor_1_name_encoded               0
actor_2_name_encoded               0
Gender Ratio                       0
Average Actor Age                  0
runtime                            0
production_company_name_encoded    0
director_encoded                   0
dtype: int64


In [31]:
# Apply the already-fitted imputer to both splits (train first, then test)
X_train_imputed, X_test_imputed = (
    imputer.transform(split) for split in (X_train, X_test)
)

# Fit an ordinary least-squares regression on the imputed training data;
# fit() returns the estimator itself, so the call can be chained.
model = LinearRegression().fit(X_train_imputed, y_train)

# Predict the target for the held-out rows
y_pred_imputed = model.predict(X_test_imputed)

In [32]:
# Score the held-out predictions and report each metric next to its label
for label, metric_fn in (
    ("Mean Squared Error (after imputation):", mean_squared_error),
    ("R-squared (after imputation):", r2_score),
):
    print(label, metric_fn(y_test, y_pred_imputed))

Mean Squared Error (after imputation): 5091849504774992.0
R-squared (after imputation): 0.7643636564336351


In [33]:
# Take the feature names straight from the training frame's columns
feature_names = X_train.columns

# Pair every feature name with its fitted coefficient and list them one per line
weights_by_feature = zip(feature_names, model.coef_)
for feature, weight in weights_by_feature:
    print(f"{feature}: {weight}")


budget: 1.079683409890444
release_month: -574399.303223075
release_year: -328011.86897834076
belongs_to_collection: 39714396.69531195
actor_0_name_encoded: 0.43067108921004155
actor_1_name_encoded: 1.451908227270505
actor_2_name_encoded: 2.624095295439769
Gender Ratio: 13914023.169674013
Average Actor Age: -369481.8071413869
runtime: 294631.9066738574
production_company_name_encoded: 0.2553424135464387
director_encoded: 0.5255275600332637
