# المكاتب وتحميل الداتا 

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from collections import Counter
from itertools import combinations

from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    BaggingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
    GradientBoostingRegressor
)
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from tabulate import tabulate

df = pd.read_csv(r"C:\Users\asus\Desktop\5\2\ML\project\data\lNH_data.csv")



In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37093 entries, 0 to 37092
Data columns (total 44 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        37093 non-null  int64  
 1   title                     37093 non-null  object 
 2   genres_x                  37093 non-null  object 
 3   original_language         37093 non-null  object 
 4   overview                  37093 non-null  object 
 5   popularity                37093 non-null  float64
 6   production_companies      37093 non-null  object 
 7   release_date              37093 non-null  object 
 8   budget                    37093 non-null  float64
 9   revenue                   37093 non-null  float64
 10  runtime                   37093 non-null  float64
 11  status                    37093 non-null  object 
 12  tagline                   37093 non-null  object 
 13  vote_average              37093 non-null  float64
 14  vote_c

In [None]:
print("genres_x:", df['genres_x'].notna().sum())
print("genres_y:", df['genres_y'].notna().sum())



genres_x: 37093
genres_y: 25876


In [4]:
df.status.value_counts() 

status
Released    37093
Name: count, dtype: int64

# تجربة 1
 target="profit" or target="revenue"

In [None]:

drop_cols = [
    'id', 'title', 'tconst', 'primaryTitle', 'originalTitle',
    'normalized_title', 'normalized_primary_title',
    'release_date',
    'original_language',
    'runtimeMinutes',
    'startYear',
    'genres_y', 
    'status',  
    'titleType',   
]

text_features = [
    'genres_x', 'overview', 'production_companies',
    'credits', 'normalized_credits_list','tagline'
]
df = df.drop(columns=drop_cols)

target1 = df['profit']
target2=df['revenue']
from sklearn.model_selection import train_test_split

# === 0. تعريف الأهداف ===
target1 = df['profit']
target2 = df['revenue']
df1 = df.drop(columns=['profit','revenue'])
df2 = df.drop(columns=['profit','revenue'])

# === 1. أول تقسيم: Train+Temp (60% + 40%) ===
X_train1, X_temp1, Y_train1, Y_temp1 = train_test_split(
    df1, target1,
    test_size=0.4,
    shuffle=True,
    random_state=42
)

X_train2, X_temp2, Y_train2, Y_temp2 = train_test_split(
    df2, target2,
    test_size=0.4,
    shuffle=True,
    random_state=42
)

# === 2. ثاني تقسيم: Validation + Test (كل منهما 20%) من Temp ===
X_valid1, X_test1, Y_valid1, Y_test1 = train_test_split(
    X_temp1, Y_temp1,
    test_size=0.5,
    shuffle=True,
    random_state=42
)

X_valid2, X_test2, Y_valid2, Y_test2 = train_test_split(
    X_temp2, Y_temp2,
    test_size=0.5,
    shuffle=True,
    random_state=42
)



# models
 1) Linear Regression
 2) Decision Tree
 3) Bagging
 4) Random Forest
 5) XGBoost
 6) Gradient Boosting
7) LightGBM
8) CatBoost


In [None]:

# === 1. إعداد الأعمدة النصية ===
for col in text_features:
    df1[col] = df1[col].fillna('')
   

categorical_features1 = [col for col in df1.columns if df1[col].dtype == 'object' and col not in text_features]
numeric_features1 = [col for col in df1.columns if df1[col].dtype in ['int64', 'float64']]

categorical_features2 = [col for col in df2.columns if df2[col].dtype == 'object' and col not in text_features]
numeric_features2 = [col for col in df2.columns if df2[col].dtype in ['int64', 'float64']]

# === 2. دالة اختيار نصوص
def text_column_selector(X):
    return X.squeeze()

# === 3. إعداد الـ Preprocessor
def make_preprocessor(categorical_features, numeric_features):
    transformers = []
    for col in text_features:
        transformers.append((
            col,
            Pipeline([
                ('selector', FunctionTransformer(text_column_selector, validate=False)),
                ('tfidf', TfidfVectorizer(max_features=100))
            ]),
            col
        ))

    transformers.append((
        'cat',
        Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]),
        categorical_features
    ))

    transformers.append((
        'num',
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean'))
        ]),
        numeric_features
    ))

    return ColumnTransformer(transformers=transformers)

preprocessor1 = make_preprocessor(categorical_features1, numeric_features1)
preprocessor2 = make_preprocessor(categorical_features2, numeric_features2)

# === 4. النماذج
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Bagging": BaggingRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42, eval_metric='rmse'),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(random_state=42, verbose=0)
}

# === 5. دالة التقييم
def evaluate_models(X_train, Y_train, X_valid, Y_valid, preprocessor):
    results = []
    trained_models = {}
    for name, model in models.items():
        pipeline = Pipeline([
            ('preprocess', preprocessor),
            ('model', model)
        ])
        pipeline.fit(X_train, Y_train)
        preds = pipeline.predict(X_valid)
        r2 = r2_score(Y_valid, preds)
        results.append({"Model": name, "R2 Score": r2})
        trained_models[name] = pipeline
        print(f" {name} done with R2: {r2:.4f}")
    return pd.DataFrame(results), trained_models


# === 7. تدريب على الإيرادات
print("\n=== Results on Revenue Prediction ===")
results_revenue, trained_revenue_models = evaluate_models(X_train2, Y_train2, X_valid2, Y_valid2, preprocessor2)
print(tabulate(results_revenue, headers="keys", tablefmt="grid", showindex=False))
# === 7. تدريب على الربح
print("\n=== Results on Profit Prediction ===")
results_profit, trained_profit_models = evaluate_models(X_train1, Y_train1, X_valid1, Y_valid1, preprocessor1)
print(tabulate(results_profit, headers="keys", tablefmt="grid", showindex=False))

# === 8. تقييم أفضل موديل على test set
best_profit_model_name = results_profit.sort_values('R2 Score', ascending=False).iloc[0]['Model']
best_revenue_model_name = results_revenue.sort_values('R2 Score', ascending=False).iloc[0]['Model']

print(f"\n Testing best Profit model ({best_profit_model_name})")
best_profit_model = trained_profit_models[best_profit_model_name]
profit_test_r2 = r2_score(Y_test1, best_profit_model.predict(X_test1))

print(f"\n Testing best Revenue model ({best_revenue_model_name})")
best_revenue_model = trained_revenue_models[best_revenue_model_name]
revenue_test_r2 = r2_score(Y_test2, best_revenue_model.predict(X_test2))

print("\n=== Final Test Set R2 Scores ===")
test_results = [
    ["Profit", best_profit_model_name, f"{profit_test_r2:.4f}"],
    ["Revenue", best_revenue_model_name, f"{revenue_test_r2:.4f}"]
]
print(tabulate(test_results, headers=["Target", "Best Model", "Test R2"], tablefmt="fancy_grid"))



=== Results on Revenue Prediction ===
 Linear Regression done with R2: 0.7785
 Decision Tree done with R2: 0.6171
 Bagging done with R2: 0.7969
 Random Forest done with R2: 0.8224
 XGBoost done with R2: 0.7909
 Gradient Boosting done with R2: 0.7818
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014855 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 80601
[LightGBM] [Info] Number of data points in the train set: 22255, number of used features: 715
[LightGBM] [Info] Start training from score 14363940.266412




 LightGBM done with R2: 0.8018
 CatBoost done with R2: 0.8239
+-------------------+------------+
| Model             |   R2 Score |
| Linear Regression |   0.778514 |
+-------------------+------------+
| Decision Tree     |   0.61707  |
+-------------------+------------+
| Bagging           |   0.796855 |
+-------------------+------------+
| Random Forest     |   0.822381 |
+-------------------+------------+
| XGBoost           |   0.79086  |
+-------------------+------------+
| Gradient Boosting |   0.78178  |
+-------------------+------------+
| LightGBM          |   0.801839 |
+-------------------+------------+
| CatBoost          |   0.823908 |
+-------------------+------------+

=== Results on Profit Prediction ===
 Linear Regression done with R2: 0.6553
 Decision Tree done with R2: 0.4496
 Bagging done with R2: 0.6928
 Random Forest done with R2: 0.7254
 XGBoost done with R2: 0.6695
 Gradient Boosting done with R2: 0.6784
[LightGBM] [Info] Auto-choosing col-wise multi-threading, 



 LightGBM done with R2: 0.6818
 CatBoost done with R2: 0.7327
+-------------------+------------+
| Model             |   R2 Score |
| Linear Regression |   0.655346 |
+-------------------+------------+
| Decision Tree     |   0.449648 |
+-------------------+------------+
| Bagging           |   0.692846 |
+-------------------+------------+
| Random Forest     |   0.725412 |
+-------------------+------------+
| XGBoost           |   0.669539 |
+-------------------+------------+
| Gradient Boosting |   0.678401 |
+-------------------+------------+
| LightGBM          |   0.681772 |
+-------------------+------------+
| CatBoost          |   0.732668 |
+-------------------+------------+

 Testing best Profit model (CatBoost)

 Testing best Revenue model (CatBoost)

=== Final Test Set R2 Scores ===
╒══════════╤══════════════╤═══════════╕
│ Target   │ Best Model   │   Test R2 │
╞══════════╪══════════════╪═══════════╡
│ Profit   │ CatBoost     │    0.7096 │
├──────────┼──────────────┼────────

In [7]:

results_folder = r"C:\Users\asus\Desktop\5\2\ML\project\results"
os.makedirs(results_folder, exist_ok=True)

results_profit.to_csv(os.path.join(results_folder, "results_profit.csv"), index=False)
results_revenue.to_csv(os.path.join(results_folder, "results_revenue.csv"), index=False)

test_results_df = pd.DataFrame(test_results, columns=["Target", "Best Model", "Test R2"])
test_results_df.to_csv(os.path.join(results_folder, "test_results.csv"), index=False)

print(" الملفات تم حفظها في:", results_folder)


 الملفات تم حفظها في: C:\Users\asus\Desktop\5\2\ML\project\results


اعتمدنا على التنبؤ بالايرادات لان نتائجها افضل 

<h3>يوجد ايرادات  وميزانيات صفرية كبيرة جدا تقريبا 80% من الداتا </h3>

In [8]:
df['revenue'].value_counts()


revenue
0.0           29594
1000000.0        26
2000000.0        24
3000000.0        21
11000000.0       20
              ...  
42300873.0        1
31155435.0        1
28623900.0        1
12376625.0        1
524.0             1
Name: count, Length: 6770, dtype: int64

In [9]:
df[df['revenue'] == 0].sample(5)

Unnamed: 0,genres_x,overview,popularity,production_companies,budget,revenue,runtime,tagline,vote_average,vote_count,...,runtimeCategory,averageRating,movie_age,rating_category,age_group,has_superstar_actor,movie_oscar,normalized_credits_list,movie_credits_oscar,company_oscars
8143,Music-Drama,A troubled and angry 11-year-old orphan from a...,9.299,Informant Media-Informant Films,0.0,0.0,106.0,Extraordinary talent needs extraordinary inspi...,6.8,243.0,...,standard,6.7,11.0,Good,11–20 yrs,False,0,"['dustinhoffman', 'kevinmchale', 'joshlucas', ...",2,0
23981,Comedy,A woman screenwriter lives in a shabby bungalo...,1.834,Columbia Pictures,0.0,0.0,102.0,"Three delightful players in a comedy modern, f...",4.7,3.0,...,standard,6.3,80.0,Average,51–100 yrs,False,0,"['irenedunne', 'alexanderknox', 'charlescoburn...",2,104
8410,Crime-Drama-Thriller-Mystery,Tells the seemingly random yet vitally connect...,9.063,Firm Films-Media 8 Entertainment-MDP Worldwide,6000000.0,0.0,86.0,Fate can change in seconds.,6.805,692.0,...,standard,7.1,22.0,Good,21–50 yrs,False,0,"['rachaelleighcook', 'hilaryswank', 'shawnhato...",2,0
26197,Western,A hired hand gets caught between a noble ranch...,1.481,Allied Artists Pictures-Scott R. Dunlap Produc...,0.0,0.0,82.0,Land Of Lawless Living...And Violence By Vigil...,5.1,5.0,...,standard,6.2,72.0,Average,51–100 yrs,False,0,"[""edmondo'brien"", 'helenwestcott', 'robertlowe...",0,0
20417,Action-Comedy-Western,Packaged and sold as an outdoor actioner Many ...,2.533,Metro-Goldwyn-Mayer,0.0,0.0,95.0,KENTUCKY ADVENTURE in CINEMASCOPE,6.4,9.0,...,standard,6.2,70.0,Average,51–100 yrs,False,0,"['roberttaylor', 'eleanorparker', 'victormclag...",2,258


In [10]:

df['budget'].value_counts()

budget
0.0           27654
10000000.0      284
1000000.0       270
20000000.0      261
5000000.0       249
              ...  
7834998.0         1
1271000.0         1
575.0             1
575375.0          1
112000.0          1
Name: count, Length: 1221, dtype: int64

In [11]:
df.profit.value_counts()

profit
 0.0           25991
-1000000.0       199
-2000000.0       121
-3000000.0       120
-500000.0        115
               ...  
 45200903.0        1
 30509925.0        1
 841334.0          1
-6198245.0         1
-112000.0          1
Name: count, Length: 7526, dtype: int64

# تجربة 2
target=  Log("revenue">0)  لجعل توزيع الداتا طبيعي اكثر 
 # (Log + Clip <1 to 0)

In [13]:


for col in text_features:
    df2[col] = df2[col].fillna('')

categorical_features2 = [col for col in df2.columns if df2[col].dtype == 'object' and col not in text_features]
numeric_features2 = [col for col in df2.columns if df2[col].dtype in ['int64', 'float64']]

def text_column_selector(X):
    return X.squeeze()

def make_preprocessor(categorical_features, numeric_features):
    transformers = []

    for col in text_features:
        transformers.append((
            col,
            Pipeline([
                ('selector', FunctionTransformer(text_column_selector, validate=False)),
                ('tfidf', TfidfVectorizer(max_features=50))
            ]),
            col
        ))

    transformers.append((
        'cat',
        Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]),
        categorical_features
    ))

    transformers.append((
        'num',
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean'))
        ]),
        numeric_features
    ))

    return ColumnTransformer(transformers=transformers)

# ========== تقسيم البيانات ==========
X_train, X_test, y_train, y_test = train_test_split(df2, target2, test_size=0.3, random_state=42)

# فقط للإيرادات الإيجابية
train_mask = y_train > 0
X_train_reg = X_train[train_mask]
y_train_reg = np.log1p(y_train[train_mask])

# تجهيز الـ preprocessor
preprocessor = make_preprocessor(categorical_features2, numeric_features2)
preprocessor.fit(X_train)

# تحويل البيانات
X_train_trans = preprocessor.transform(X_train_reg)
X_test_trans  = preprocessor.transform(X_test)

# ========== الموديلات ==========
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Bagging": BaggingRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42, eval_metric='rmse'),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(random_state=42, verbose=0)
}

# ========== تدريب وتقييم ==========
results = []

for name, model in models.items():
    model.fit(X_train_trans, y_train_reg)

    predicted_revenues = []
    for i in range(X_test.shape[0]):
        x_trans = X_test_trans[i].reshape(1, -1)
        pred_log = model.predict(x_trans)[0]
        pred = np.expm1(pred_log)
        predicted_revenues.append(0 if pred < 1 else pred)

    y_true = y_test.values
    y_pred = np.array(predicted_revenues)

    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    results.append({
        'Model': name,
        'R²': round(r2, 4),
        'MAE': round(mae, 2),
    })

print("\n Evaluation Metrics (Log + Clip <1 to 0):")
print(tabulate(results, headers='keys', tablefmt='grid'))

# ========== اختيار أفضل موديل ==========
best_model_name = sorted(results, key=lambda x: x['R²'], reverse=True)[0]['Model']
best_model = models[best_model_name]

print(f"\n Best Model: {best_model_name}")

best_model.fit(X_train_trans, y_train_reg)

# ========== التنبؤ النهائي ==========
predicted_revenues = []
for i in range(X_test.shape[0]):
    x_trans = X_test_trans[i].reshape(1, -1)
    pred_log = best_model.predict(x_trans)[0]
    pred = np.expm1(pred_log)
    predicted_revenues.append(0 if pred < 1 else pred)

preds = np.array(predicted_revenues)

# ========== حفظ النتائج ==========
final_results = pd.DataFrame({
    'ID': X_test.index,
    'True Revenue': y_test.values,
    'Predicted Revenue': preds,
    'Absolute Error': abs(y_test.values - preds),
    'Relative Error (%)': np.where(
        y_test.values == 0,
        np.nan,
        abs(y_test.values - preds) / y_test.values * 100
    )
})

final_results.to_csv('final_test_predictions_final12.csv', index=False)
print("\n تم حفظ النتائج في ملف: final_test_predictions_final.csv")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32488
[LightGBM] [Info] Number of data points in the train set: 5205, number of used features: 365
[LightGBM] [Info] Start training from score 16.000757





 Evaluation Metrics (Log + Clip <1 to 0):
+-------------------+-------------+-------------+
| Model             |          R² |         MAE |
| Linear Regression | -33606.5    | 3.59024e+08 |
+-------------------+-------------+-------------+
| Decision Tree     |      0.6436 | 1.0545e+07  |
+-------------------+-------------+-------------+
| Bagging           |      0.761  | 7.49909e+06 |
+-------------------+-------------+-------------+
| Random Forest     |      0.7801 | 7.12493e+06 |
+-------------------+-------------+-------------+
| XGBoost           |      0.7618 | 7.65894e+06 |
+-------------------+-------------+-------------+
| Gradient Boosting |      0.7192 | 7.3746e+06  |
+-------------------+-------------+-------------+
| LightGBM          |      0.7655 | 7.06569e+06 |
+-------------------+-------------+-------------+
| CatBoost          |      0.7737 | 6.99322e+06 |
+-------------------+-------------+-------------+

 Best Model: Random Forest

 تم حفظ النتائج في ملف: fina

  abs(y_test.values - preds) / y_test.values * 100


# تجربة 3
# Conditional Regression after Binary Classification

In [16]:
# --- إعداد النصوص ---
for col in text_features:
    df2[col] = df2[col].fillna('')

# --- تحديد الأعمدة التصنيفية والرقمية ---
categorical_features2 = [col for col in df2.columns if df2[col].dtype == 'object' and col not in text_features]
numeric_features2 = [col for col in df2.columns if df2[col].dtype in ['int64', 'float64']]

def text_column_selector(X):
    return X.squeeze()

def make_preprocessor(categorical_features, numeric_features):
    transformers = []

    for col in text_features:
        transformers.append((
            col,
            Pipeline([
                ('selector', FunctionTransformer(text_column_selector, validate=False)),
                ('tfidf', TfidfVectorizer(max_features=100))
            ]),
            col
        ))

    transformers.append((
        'cat',
        Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]),
        categorical_features
    ))

    transformers.append((
        'num',
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean'))
        ]),
        numeric_features
    ))

    return ColumnTransformer(transformers=transformers)

# --- تعريف موديلات الريجريشن ---
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Bagging": BaggingRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42, eval_metric='rmse'),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(random_state=42, verbose=0)
}

print("\n===  Two-Stage Revenue Prediction: Classification + Regression ===")

# --- 1. تجهيز تصنيف الإيراد ---
target2_class = (target2 > 0).astype(int)

# تقسيم البيانات: تدريب - تحقق - اختبار
X_train2, X_temp2, y_class_train, y_class_temp = train_test_split(df2, target2_class, test_size=0.4, random_state=42)
X_valid2, X_test2, y_class_valid, y_class_test = train_test_split(X_temp2, y_class_temp, test_size=0.5, random_state=42)

# --- 2. تدريب موديل التصنيف ---
preprocessor2 = make_preprocessor(categorical_features2, numeric_features2)

classifier_pipeline = Pipeline([
    ('preprocess', preprocessor2),
    ('model', LogisticRegression(random_state=42, max_iter=1000))
])
classifier_pipeline.fit(X_train2, y_class_train)

# تقييم التصنيف
y_class_pred = classifier_pipeline.predict(X_valid2)
print("\n Classification Report:")
print(classification_report(y_class_valid, y_class_pred))

# --- 3. اختيار صفوف تحقق الإيراد المتوقع ---
has_revenue_valid = y_class_pred == 1
X_valid_reg = X_valid2[has_revenue_valid]
Y_valid_reg = target2.loc[X_valid2.index][has_revenue_valid]

# --- 4. تدريب موديلات الريجريشن على بيانات بها إيراد فقط ---
revenue_mask_train = target2.loc[X_train2.index] > 0
X_train_reg = X_train2[revenue_mask_train]
Y_train_reg = target2.loc[X_train2.index][revenue_mask_train]

# تجهيز preprocessor جديد للريجريشن
preprocessor3 = make_preprocessor(categorical_features2, numeric_features2)
preprocessor3.fit(X_train_reg)

# تحويل البيانات
X_train_reg_trans = preprocessor3.transform(X_train_reg)
X_valid_reg_trans = preprocessor3.transform(X_valid_reg)

results_models = []
trained_models = {}

for name, model in models.items():
    model.fit(X_train_reg_trans, Y_train_reg)
    preds = model.predict(X_valid_reg_trans)
    r2 = r2_score(Y_valid_reg, preds)
    results_models.append({'Model': name, 'R2': r2})
    trained_models[name] = model
    print(f" {name} done with R2: {r2:.2f}")

print("\n Regression Results (only for predicted 'has revenue'):")
print(tabulate(results_models, headers='keys', tablefmt='grid'))

# --- 5. اختبار أفضل موديل على بيانات الاختبار ---
best_model_name = max(results_models, key=lambda x: x['R2'])['Model']
best_model = trained_models[best_model_name]

# توقع التصنيف على بيانات الاختبار
y_class_test_pred = classifier_pipeline.predict(X_test2)
has_revenue_test = y_class_test_pred == 1
X_test_reg = X_test2[has_revenue_test]

# تحويل بيانات الاختبار
X_test_reg_trans = preprocessor3.transform(X_test_reg)

# توقع الإيرادات
predicted_revenues_reg = best_model.predict(X_test_reg_trans)

# بناء قائمة الإيرادات المتوقعة لجميع الصفوف
predicted_revenues = []
j = 0
for i in range(len(X_test2)):
    if has_revenue_test[i]:
        pred = predicted_revenues_reg[j]
        j += 1
    else:
        pred = 0
    predicted_revenues.append(pred)

# --- 6. التقييم النهائي ---
true_revenues = target2.loc[X_test2.index]
abs_error = abs(true_revenues.values - predicted_revenues)
relative_error = abs_error / true_revenues.replace(0, np.nan) * 100
r2_test = r2_score(true_revenues, predicted_revenues)

final_results = pd.DataFrame({
    'True Revenue': true_revenues.values,
    'Predicted Revenue': predicted_revenues,
    'Absolute Error': abs_error,
    'Relative Error (%)': relative_error
})

print("\n===  Final Test Results using Best Model (Two-Stage) ===")
print(tabulate(final_results.head(10), headers='keys', tablefmt='grid'))
print(f"\n Final Test R² Score: {r2_test:.4f}")

# --- 7. حفظ النتائج ---
final_results['ID'] = X_test2.index
final_results.to_csv('final_test_predictions.csv', index=False)
print("\n تم حفظ النتائج في ملف: final_test_predictions.csv")



===  Two-Stage Revenue Prediction: Classification + Regression ===


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5880
           1       0.90      0.60      0.72      1539

    accuracy                           0.90      7419
   macro avg       0.90      0.79      0.83      7419
weighted avg       0.90      0.90      0.89      7419

 Linear Regression done with R2: 0.72
 Decision Tree done with R2: 0.53
 Bagging done with R2: 0.76
 Random Forest done with R2: 0.77
 XGBoost done with R2: 0.70
 Gradient Boosting done with R2: 0.73
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010547 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 43480
[LightGBM] [Info] Number of data points in the train set: 4468, number of used features: 635
[LightGBM] [Info] Start training from score 71546439.263429




 LightGBM done with R2: 0.74
 CatBoost done with R2: 0.79

 Regression Results (only for predicted 'has revenue'):
+-------------------+----------+
| Model             |       R2 |
| Linear Regression | 0.724279 |
+-------------------+----------+
| Decision Tree     | 0.529569 |
+-------------------+----------+
| Bagging           | 0.761685 |
+-------------------+----------+
| Random Forest     | 0.765805 |
+-------------------+----------+
| XGBoost           | 0.697216 |
+-------------------+----------+
| Gradient Boosting | 0.726372 |
+-------------------+----------+
| LightGBM          | 0.740498 |
+-------------------+----------+
| CatBoost          | 0.794189 |
+-------------------+----------+

===  Final Test Results using Best Model (Two-Stage) ===
+-------+------------------+---------------------+------------------+----------------------+
|       |     True Revenue |   Predicted Revenue |   Absolute Error |   Relative Error (%) |
| 35738 |      0           |         0         

مع لوغاريتم 

In [19]:
# --- إعداد النصوص ---
for col in text_features:
    df2[col] = df2[col].fillna('')

# --- الأعمدة التصنيفية والرقمية ---
categorical_features2 = [col for col in df2.columns if df2[col].dtype == 'object' and col not in text_features]
numeric_features2 = [col for col in df2.columns if df2[col].dtype in ['int64', 'float64']]

def text_column_selector(X):
    return X.squeeze()

def make_preprocessor(categorical_features, numeric_features):
    transformers = []

    for col in text_features:
        transformers.append((
            col,
            Pipeline([
                ('selector', FunctionTransformer(text_column_selector, validate=False)),
                ('tfidf', TfidfVectorizer(max_features=50))
            ]),
            col
        ))

    transformers.append((
        'cat',
        Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]),
        categorical_features
    ))

    transformers.append((
        'num',
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean'))
        ]),
        numeric_features
    ))

    return ColumnTransformer(transformers=transformers)

# === تجهيز البيانات ===
target2_class = (target2 > 0).astype(int)
X_train2, X_temp2, y_class_train, y_class_temp = train_test_split(df2, target2_class, test_size=0.4, random_state=42)
X_valid2, X_test2, y_class_valid, y_class_test = train_test_split(X_temp2, y_class_temp, test_size=0.5, random_state=42)

# === تصنيف ===
preprocessor_clf = make_preprocessor(categorical_features2, numeric_features2)
X_train2_trans = preprocessor_clf.fit_transform(X_train2)
X_valid2_trans = preprocessor_clf.transform(X_valid2)
X_test2_trans  = preprocessor_clf.transform(X_test2)

classifier = LogisticRegression(max_iter=1000, random_state=42)
classifier.fit(X_train2_trans, y_class_train)
y_class_pred = classifier.predict(X_valid2_trans)

print("\n Classification Report:")
print(classification_report(y_class_valid, y_class_pred))

# === بيانات الإيرادات فقط ===
has_revenue_valid = y_class_pred == 1
valid2_index = X_valid2.index[has_revenue_valid]
Y_valid_reg = np.log1p(target2.loc[valid2_index])
X_valid2_reg = X_valid2.loc[valid2_index]

revenue_mask_train = target2.loc[X_train2.index] > 0
train2_index = X_train2.index[revenue_mask_train]
Y_train_reg = np.log1p(target2.loc[train2_index])
X_train2_reg = X_train2.loc[train2_index]

# === تجهيز preprocessor جديد للريجريشن ===
preprocessor_reg = make_preprocessor(categorical_features2, numeric_features2)
X_train_reg_trans = preprocessor_reg.fit_transform(X_train2_reg)
X_valid_reg_trans = preprocessor_reg.transform(X_valid2_reg)

# === تعريف الموديلات ===
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Bagging": BaggingRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42, eval_metric='rmse'),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(random_state=42, verbose=0)
}

results_models = []
trained_models = {}

for name, model in models.items():
    model.fit(X_train_reg_trans, Y_train_reg)
    preds_log = model.predict(X_valid_reg_trans)
    preds = np.expm1(preds_log)
    r2 = r2_score(np.expm1(Y_valid_reg), preds)
    results_models.append({'Model': name, 'R2': r2})
    trained_models[name] = model
    print(f" {name} done with R2: {r2:.2f}")

print("\n Regression Results (only for predicted 'has revenue'):")
print(tabulate(results_models, headers='keys', tablefmt='grid'))

# === أفضل موديل ===
best_model_name = sorted(results_models, key=lambda x: x['R2'], reverse=True)[0]['Model']
best_model = trained_models[best_model_name]

# === اختبار على البيانات الجديدة ===
y_class_test_pred = classifier.predict(X_test2_trans)
has_revenue_test = y_class_test_pred == 1
test2_index = X_test2.index

X_test2_df = X_test2.copy()
X_test2_reg = X_test2_df.loc[test2_index[has_revenue_test]]
X_test2_reg_trans = preprocessor_reg.transform(X_test2_reg)

predicted_revenues_reg = best_model.predict(X_test2_reg_trans)

# === بناء التوقعات الكاملة ===
predicted_revenues = []
j = 0
for i in range(X_test2_trans.shape[0]):
    if has_revenue_test[i]:
        pred = np.expm1(predicted_revenues_reg[j])
        j += 1
    else:
        pred = 0
    predicted_revenues.append(pred)

# === التقييم ===
true_revenues = target2.loc[test2_index]
abs_error = abs(true_revenues.values - predicted_revenues)
relative_error = abs_error / true_revenues.replace(0, np.nan) * 100
r2_test = r2_score(true_revenues, predicted_revenues)

final_results = pd.DataFrame({
    'ID': test2_index,
    'True Revenue': true_revenues.values,
    'Predicted Revenue': predicted_revenues,
    'Absolute Error': abs_error,
    'Relative Error (%)': relative_error
})

print("\n===  Final Test Results using Best Model (Two-Stage with log) ===")
print(tabulate(final_results.head(10), headers='keys', tablefmt='grid'))
print(f"\n Final Test R² Score: {r2_test:.4f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5880
           1       0.90      0.59      0.72      1539

    accuracy                           0.90      7419
   macro avg       0.90      0.79      0.83      7419
weighted avg       0.90      0.90      0.89      7419

 Linear Regression done with R2: -18939.58
 Decision Tree done with R2: 0.57
 Bagging done with R2: 0.71
 Random Forest done with R2: 0.74
 XGBoost done with R2: 0.66
 Gradient Boosting done with R2: 0.72
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006710 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30675
[LightGBM] [Info] Number of data points in the train set: 4468, number of used features: 385
[LightGBM] [Info] Start training from score 16.026233




 LightGBM done with R2: 0.78
 CatBoost done with R2: 0.77

 Regression Results (only for predicted 'has revenue'):
+-------------------+---------------+
| Model             |            R2 |
| Linear Regression | -18939.6      |
+-------------------+---------------+
| Decision Tree     |      0.570972 |
+-------------------+---------------+
| Bagging           |      0.709042 |
+-------------------+---------------+
| Random Forest     |      0.741208 |
+-------------------+---------------+
| XGBoost           |      0.663857 |
+-------------------+---------------+
| Gradient Boosting |      0.717363 |
+-------------------+---------------+
| LightGBM          |      0.777281 |
+-------------------+---------------+
| CatBoost          |      0.766203 |
+-------------------+---------------+

===  Final Test Results using Best Model (Two-Stage with log) ===
+-------+-------+------------------+---------------------+------------------+----------------------+
|       |    ID |     True Revenu



# تجربة 4 ✅
target="profitable" الفيلم ربح ام لا 
:classfication

In [20]:
df['profitable'] = (df['revenue'] > df['budget']).astype(int)

df=df.drop(columns=['revenue'])


In [21]:
df.columns

Index(['genres_x', 'overview', 'popularity', 'production_companies', 'budget',
       'runtime', 'tagline', 'vote_average', 'vote_count', 'credits',
       'tagline_sentiment', 'profit', 'release_year', 'release_month',
       'release_day', 'unique_actors_count', 'actors_avg_rating',
       'actors_bayesian_rating', 'actors_penalty_rating', 'runtimeCategory',
       'averageRating', 'movie_age', 'rating_category', 'age_group',
       'has_superstar_actor', 'movie_oscar', 'normalized_credits_list',
       'movie_credits_oscar', 'company_oscars', 'profitable'],
      dtype='object')

In [None]:

X = df[['genres_x', 'overview', 'popularity', 'production_companies', 'budget',
       'runtime', 'tagline', 'vote_average', 'vote_count', 'credits',
       'tagline_sentiment', 'profit', 'release_year', 'release_month',
       'release_day', 'unique_actors_count', 'actors_avg_rating',
       'actors_bayesian_rating', 'actors_penalty_rating', 'runtimeCategory',
       'averageRating', 'movie_age', 'rating_category', 'age_group',
       'has_superstar_actor', 'movie_oscar', 'normalized_credits_list',
       'movie_credits_oscar', 'company_oscars']]
y = df['profitable']


In [None]:

counter = Counter(y)
num_losers = counter[0]  # عدد الخاسرين
num_winners = counter[1]  # عدد الرابحين

# طباعة
print(f"عدد الخاسرين: {num_losers}")
print(f"عدد الرابحين: {num_winners}")


عدد الخاسرين: 31378
عدد الرابحين: 5715


In [None]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


# XGBoosting

In [None]:
weight = num_losers / num_winners

# تحويل الأعمدة النصية إلى categorical
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        X_train[col] = X_train[col].astype('category')
        X_test[col] = X_test[col].astype('category')

        
# تدريب الموديل مع تفعيل enable_categorical=True
model = XGBClassifier(scale_pos_weight=weight, use_label_encoder=False, eval_metric='logloss', enable_categorical=True)
model.fit(X_train, y_train)

# التنبؤ
y_pred = model.predict(X_test)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6276
           1       1.00      0.98      0.99      1143

    accuracy                           1.00      7419
   macro avg       1.00      0.99      0.99      7419
weighted avg       1.00      1.00      1.00      7419

[[6276    0]
 [  20 1123]]


In [28]:
y_prob = model.predict_proba(X_test)[:, 1]  # احتمال أن يكون رابح

results_df = pd.DataFrame({
    'True_Label': y_test.values,
    'Predicted_Label': y_pred,
    #'Predicted_Probability': y_prob
})

results_df.to_csv("prediction_vs_truth_with_prob3.csv", index=False)
print("✅ تم حفظ الملف مع الاحتمالات")


✅ تم حفظ الملف مع الاحتمالات


In [29]:
# أخطاء: رابحة تنبأها خاسرة (False Negative)
false_negatives = results_df[(results_df['True_Label'] == 1) & (results_df['Predicted_Label'] == 0)]

# أخطاء: خاسرة تنبأها رابحة (False Positive)
false_positives = results_df[(results_df['True_Label'] == 0) & (results_df['Predicted_Label'] == 1)]

print(f"عدد False Negatives: {len(false_negatives)}")
print(false_negatives)

print(f"\nعدد False Positives: {len(false_positives)}")
print(false_positives)


عدد False Negatives: 20
      True_Label  Predicted_Label
193            1                0
1187           1                0
1633           1                0
2192           1                0
2627           1                0
3615           1                0
3874           1                0
4066           1                0
4237           1                0
4345           1                0
4362           1                0
4601           1                0
4782           1                0
5022           1                0
5327           1                0
6312           1                0
6574           1                0
6962           1                0
7282           1                0
7297           1                0

عدد False Positives: 0
Empty DataFrame
Columns: [True_Label, Predicted_Label]
Index: []


In [30]:
print(f"Total test samples: {len(y_test)}")
print(f"Total predicted labels: {len(y_pred)}")
print(f"Number of mismatches: {(y_test != y_pred).sum()}")


Total test samples: 7419
Total predicted labels: 7419
Number of mismatches: 20


# CatBoosting 
<h3> the best model ✅</h3>

In [None]:
cat_features = [
    col for col in X_train.columns 
    if str(X_train[col].dtype) in ['object', 'category']
]
cat_model = CatBoostClassifier(random_seed=42, verbose=0)
cat_model.fit(X_train, y_train, cat_features= cat_features)
y_pred = cat_model.predict(X_test)


In [None]:

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6276
           1       1.00      1.00      1.00      1143

    accuracy                           1.00      7419
   macro avg       1.00      1.00      1.00      7419
weighted avg       1.00      1.00      1.00      7419



In [34]:
print(f"Total test samples: {len(y_test)}")
print(f"Total predicted labels: {len(y_pred)}")
print(f"Number of mismatches: {(y_test != y_pred).sum()}")


Total test samples: 7419
Total predicted labels: 7419
Number of mismatches: 0


In [35]:
y_prob = cat_model.predict_proba(X_test)[:, 1]  # احتمال الربح

results_df = pd.DataFrame({
    'True_Label': y_test.values,
    'Predicted_Label': y_pred,
    #'Predicted_Probability': y_prob
})

# False Negatives: أفلام رابحة تنبأها خاسرة
false_negatives = results_df[(results_df['True_Label'] == 1) & (results_df['Predicted_Label'] == 0)]

# False Positives: أفلام خاسرة تنبأها رابحة
false_positives = results_df[(results_df['True_Label'] == 0) & (results_df['Predicted_Label'] == 1)]

print(f"عدد False Negatives: {len(false_negatives)}")
print(f"عدد False Positives: {len(false_positives)}")

# حفظ النتائج (اختياري)
results_df.to_csv("catboost_results.csv", index=False)


عدد False Negatives: 0
عدد False Positives: 0


In [38]:
example_row = df[:1].drop(columns=['profitable'])

example_row['budget'] = 0
example_row['runtime'] = 60
example_row['popularity'] = 10.1
example_row['vote_average'] = 7.5
example_row['vote_count'] = 4000000
example_row['release_year'] = 2024
example_row['release_month'] = 12
example_row['release_day'] = 25
example_row['tagline_sentiment'] = 1
example_row['actors_avg_rating'] = 6.8
example_row['actors_bayesian_rating'] = 7.1
example_row['actors_penalty_rating'] = 6.0
example_row['has_superstar_actor'] = False
example_row['movie_credits_oscar'] = 0
example_row['company_oscars'] = 0
prediction = cat_model.predict(example_row)
y_prob = cat_model.predict_proba(example_row)[:, 1]

print(" النتيجة:", "رابح " if prediction[0] == 1 else "خاسر ")
print(f" احتمالية الربح: {y_prob[0]*100:.2f}%")


 النتيجة: رابح 
 احتمالية الربح: 99.98%
