## IMDB Regression

In [6]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso,Ridge

In [7]:
df_combined = pd.read_csv("imdb_scraped_data_all.csv")

In [11]:
# X ve Y değişkenlerimizi oluşturma
X = df_combined.loc[:,["Year", "Runtime", "Gross US & Canada", "Votes", "Metascore", "Estimated Revenue", "Budget"]]
y = df_combined["Rating"]

# Train/Test Ayrımı
X_train, x_test, Y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train/Validation Ayrımı
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.25, random_state=0)

In [13]:
# X ve Y değişkenlerimizi oluşturma
X = df_combined.loc[:, ["Year", "Runtime", "Gross US & Canada", "Votes", "Metascore", "Estimated Revenue", "Budget"]]
y = df_combined["Rating"]

# Train/Test Ayrımı
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train/Validation Ayrımı
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=0)


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Sınıflandırma fonksiyonu
def categorize_rating(rating):
    if rating <= 5:
        return "Kötü"
    elif rating <= 7:
        return "Ortalama"
    else:
        return "İyi"

# Hedef değişkeni sınıflandırma
df_combined["Rating_Class"] = df_combined["Rating"].apply(categorize_rating)

# X ve Y değişkenlerini oluşturma
X = df_combined.loc[:,["Year", "Runtime", "Gross US & Canada", "Votes", "Metascore", "Estimated Revenue", "Budget"]]
y = df_combined["Rating_Class"]

# Train/Test Ayrımı
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Modelleri tanımlama
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('Naive Bayes', GaussianNB()))
models.append(('Decision Tree (CART)', DecisionTreeClassifier())) 
models.append(('K-NN', KNeighborsClassifier()))
models.append(('SVM', SVC()))
models.append(('Gradient Boosting Classifier', GradientBoostingClassifier()))
models.append(('AdaBoostClassifier', AdaBoostClassifier()))
models.append(('BaggingClassifier', BaggingClassifier()))
models.append(('RandomForestClassifier', RandomForestClassifier()))
models.append(('MLPClassifier', MLPClassifier()))

# Modelleri döngü ile deneme
for name, model in models:
    model = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    from sklearn import metrics

    print("%s -> ACC: %%%.2f" % (name, metrics.accuracy_score(y_test, y_pred) * 100))

Logistic Regression -> ACC: %66.53
Naive Bayes -> ACC: %46.65
Decision Tree (CART) -> ACC: %73.33
K-NN -> ACC: %59.53
SVM -> ACC: %63.95
Gradient Boosting Classifier -> ACC: %80.02
AdaBoostClassifier -> ACC: %74.67
BaggingClassifier -> ACC: %77.75
RandomForestClassifier -> ACC: %79.71
MLPClassifier -> ACC: %59.42


In [16]:
# Öncelikle kütüphaneleri import edelim.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Gradient Boosting Classifier modelini tanımlayalım.
gbk = GradientBoostingClassifier()

# Oynamak istediğiniz parametreler ve aralıklarını belirleyelim.
parameters = {'learning_rate': [0.01, 1, 0.01], 'n_estimators': [0, 500]}

# GridSearchCV ile hiperparametre arama yapacak şekilde yeni bir model oluşturalım.
gbk_grd = GridSearchCV(gbk, parameters, cv=5)

# Yeni modeli eğitelim.
gbk_grd.fit(X_train, y_train)

# En iyi parametreleri ve doğruluk skorunu görelim.
print("En iyi parametreler:", gbk_grd.best_params_)
print("En iyi doğruluk skoru:", gbk_grd.best_score_)

# Test verilerini kullanarak tahmin yapalım.
y_pred = gbk_grd.predict(X_test)

15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Sw4yt\AppData\Roaming\Python\Python38\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Sw4yt\AppData\Roaming\Python\Python38\site-packages\sklearn\ensemble\_gb.py", line 420, in fit
    self._validate_params()
  File "C:\Users\Sw4yt\AppData\Roaming\Python\Python38\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Sw4yt\AppData\Roaming\Python\Python38\site-packages\sklearn\utils\_param_validation.py",

En iyi parametreler: {'learning_rate': 0.01, 'n_estimators': 500}
En iyi doğruluk skoru: 0.8209280340723641


In [17]:
xtest_new = np.array([0,2,1,5,0,1,1,0,0,1,1,0,1,1])
xtest_new = xtest_new.reshape(1, -1)
y_pred = gbk.predict(xtest_new)
proba = gbk.predict_proba(xtest_new)[:,1]
print("My Prediction is:", y_pred, "Probability is:", proba

SyntaxError: unexpected EOF while parsing (714581761.py, line 5)

In [None]:
def categorize_rating(rating):
    if rating <= 5:
        return "Kötü"
    elif rating <= 7:
        return "Ortalama"
    else:
        return "İyi"

df_combined["Rating_Class"] = df_combined["Rating"].apply(categorize_rating)

In [14]:
#burada bütün modelleri ımport ediyoruz.
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

#burada modelleri bir listenin içerisine alıp parametreleri ile beraber tanımlıyoruz.
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('Naive Bayes', GaussianNB()))
models.append(('Decision Tree (CART)',DecisionTreeClassifier())) 
models.append(('K-NN', KNeighborsClassifier()))
models.append(('SVM', SVC()))
models.append(('Gradient Boosting Classifier', GradientBoostingClassifier()))
models.append(('AdaBoostClassifier', AdaBoostClassifier()))
models.append(('BaggingClassifier', BaggingClassifier()))
models.append(('RandomForestClassifier', RandomForestClassifier()))
models.append(('MLPClassifier', MLPClassifier()))

#burada bir döngü vasıtasıyla tek tek bütün modelleri deneyerek sonuçları karşılaştırıyoruz. 
for name, model in models:
    model = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    from sklearn import metrics

    print("%s -> ACC: %%%.2f" % (name,metrics.accuracy_score(y_test, y_pred)*100))

ValueError: Unknown label type: 'continuous'

In [9]:
print('X Train:', x_train.shape)
print('X Validation:', x_val.shape)
print('X test:', x_test.shape)

X Train: (2910, 7)
X Validation: (971, 7)
X test: (971, 7)


In [113]:
lreg = LinearRegression()

lreg.fit(x_train, y_train)

pred = lreg.predict(x_val)

# MSE Hesabı
mse = np.mean((pred - y_val)**2)
print("MSE: ", mse)

# R2 Skor
print("R2 Score: ", lreg.score(x_val, y_val))

MSE:  0.293127793963753
R2 Score:  0.6423414808677816


In [114]:
# Modeli Oluşturma
model = sm.OLS(y_train, x_train)

# Modeli Eğitme
fit = model.fit()

# Oluşturduğumuz modele ilişkin özet tablosunu görüntüleme
fit.summary()

0,1,2,3
Dep. Variable:,Rating,R-squared (uncentered):,0.993
Model:,OLS,Adj. R-squared (uncentered):,0.993
Method:,Least Squares,F-statistic:,66620.0
Date:,"Mon, 15 May 2023",Prob (F-statistic):,0.0
Time:,14:14:48,Log-Likelihood:,-2437.3
No. Observations:,2910,AIC:,4887.0
Df Residuals:,2904,BIC:,4922.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Year,0.0019,3.22e-05,57.722,0.000,0.002,0.002
Runtime,0.0084,0.001,14.094,0.000,0.007,0.010
Gross US & Canada,-8.673e-10,8.21e-11,-10.566,0.000,-1.03e-09,-7.06e-10
Votes,1.133e-06,6.15e-08,18.431,0.000,1.01e-06,1.25e-06
Metascore,0.0328,0.001,50.175,0.000,0.032,0.034
Estimated Revenue,2.753e-10,9.37e-11,2.939,0.003,9.16e-11,4.59e-10
Budget,-2.434e-09,2.58e-10,-9.419,0.000,-2.94e-09,-1.93e-09

0,1,2,3
Omnibus:,374.58,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,906.139
Skew:,-0.737,Prob(JB):,1.7199999999999998e-197
Kurtosis:,5.303,Cond. No.,1.68e+16


In [91]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Verilerinizi eğitim, doğrulama ve test kümelerine ayırın
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Modelinizi oluşturun
model = LinearRegression()

# Modeli eğitin
model.fit(X_train, y_train)

# Doğrulama verilerini kullanarak model performansını değerlendirin
y_val_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

print("MSE:", mse)
print("R2 Score:", r2)

# Test verilerini kullanarak model performansını test edin
y_test_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print("MSE:", mse)
print("R2 Score:", r2)

MSE: 6.880485842604106e-28
R2 Score: 1.0
MSE: 7.476737949256839e-28
R2 Score: 1.0


In [94]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Modeli Oluşturma
model = sm.OLS(y_train, x_train)

# Modeli Eğitme
fit = model.fit()

# Oluşturduğumuz modele ilişkin özet tablosunu görüntüleme
fit.summary()

0,1,2,3
Dep. Variable:,Rating,R-squared (uncentered):,1.0
Model:,OLS,Adj. R-squared (uncentered):,1.0
Method:,Least Squares,F-statistic:,4.587e+23
Date:,"Mon, 15 May 2023",Prob (F-statistic):,0.0
Time:,12:43:21,Log-Likelihood:,60889.0
No. Observations:,2910,AIC:,-121800.0
Df Residuals:,2903,BIC:,-121700.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Year,5.822e-17,1.67e-14,0.003,0.997,-3.27e-14,3.28e-14
Runtime,-4.978e-15,2.18e-13,-0.023,0.982,-4.32e-13,4.22e-13
Budget,-9.315e-10,7.15e-20,-1.3e+10,0.000,-9.32e-10,-9.32e-10
Gross US & Canada,-3.711e-09,2.5e-20,-1.48e+11,0.000,-3.71e-09,-3.71e-09
Votes,-2.995e-17,2.3e-17,-1.302,0.193,-7.51e-17,1.52e-17
Metascore,-0.1000,9.03e-13,-1.11e+11,0.000,-0.100,-0.100
Estimated Worldwide Gross,2.416e-09,3.6e-20,6.71e+10,0.000,2.42e-09,2.42e-09
Score,2.0000,1.31e-11,1.52e+11,0.000,2.000,2.000
Estimated Revenue,-9.315e-10,4.83e-20,-1.93e+10,0.000,-9.32e-10,-9.32e-10

0,1,2,3
Omnibus:,2293.073,Durbin-Watson:,1.262
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52589.919
Skew:,-3.63,Prob(JB):,0.0
Kurtosis:,22.52,Cond. No.,1.65e+16


In [166]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# X ve Y değişkenlerimizi oluşturma
X1 = df_combined.loc[:,["Year", "Runtime", "Gross US & Canada", "Votes", "Metascore"]]
y1 = df_combined["Rating"]

# Train/Test Ayrımı
X_train2, x_test2, Y_train2, y_test2 = train_test_split(X1, y1, test_size=0.2, random_state=42)

# Train/Validation Ayrımı
x_train2, x_val2, y_train2, y_val2 = train_test_split(X_train2, Y_train2, test_size=0.25, random_state=42)

print('X Train:', x_train2.shape)
print('X Validation:', x_val2.shape)
print('X test:', x_test2.shape)

X Train: (2910, 5)
X Validation: (971, 5)
X test: (971, 5)


In [167]:
import pandas as pd
import numpy as np
# Modeli Oluşturma
lreg = LinearRegression()

lreg.fit(x_train2,y_train2)

pred = lreg.predict(x_val2)

# MSE Hesabı
mse = np.mean((pred - y_val2)**2)
print("MSE: ", mse)

# R2 Skor
print("R2 Score: ", lreg.score(x_val2, y_val2))

MSE:  0.29504298633777776
R2 Score:  0.6400046677696976


In [168]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Modeli Oluşturma
model2 = sm.OLS(y_train2, x_train2)

# Modeli Eğitme
fit2 = model2.fit()

# Oluşturduğumuz modele ilişkin özet tablosunu görüntüleme
fit2.summary()

0,1,2,3
Dep. Variable:,Rating,R-squared (uncentered):,0.993
Model:,OLS,Adj. R-squared (uncentered):,0.993
Method:,Least Squares,F-statistic:,78150.0
Date:,"Mon, 15 May 2023",Prob (F-statistic):,0.0
Time:,13:56:17,Log-Likelihood:,-2470.6
No. Observations:,2910,AIC:,4951.0
Df Residuals:,2905,BIC:,4981.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Year,0.0019,3.25e-05,57.033,0.000,0.002,0.002
Runtime,0.0073,0.001,12.462,0.000,0.006,0.008
Gross US & Canada,-1.187e-09,1.77e-10,-6.712,0.000,-1.53e-09,-8.4e-10
Votes,1.118e-06,6.21e-08,17.995,0.000,9.96e-07,1.24e-06
Metascore,0.0342,0.001,53.386,0.000,0.033,0.035

0,1,2,3
Omnibus:,320.128,Durbin-Watson:,2.01
Prob(Omnibus):,0.0,Jarque-Bera (JB):,765.653
Skew:,-0.643,Prob(JB):,5.5e-167
Kurtosis:,5.159,Cond. No.,5960000.0
