# Company Market Cap Prediction
### Given data about big companies, let's try to predict the market capitalization of a given company.

### we will use a variety of regression models to make our predictions.

In [26]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# testing
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor



In [27]:
data=pd.read_csv('Fortune_1000.csv')

In [28]:
data

Unnamed: 0,company,rank,rank_change,revenue,profit,num. of employees,sector,city,state,newcomer,ceo_founder,ceo_woman,profitable,prev_rank,CEO,Website,Ticker,Market Cap
0,Walmart,1,0.0,523964.0,14881.0,2200000,Retailing,Bentonville,AR,no,no,no,yes,1.0,C. Douglas McMillon,https://www.stock.walmart.com,WMT,411690
1,Amazon,2,3.0,280522.0,11588.0,798000,Retailing,Seattle,WA,no,yes,no,yes,5.0,Jeffrey P. Bezos,https://www.amazon.com,AMZN,1637405
2,Exxon Mobil,3,-1.0,264938.0,14340.0,74900,Energy,Irving,TX,no,no,no,yes,2.0,Darren W. Woods,https://www.exxonmobil.com,XOM,177923
3,Apple,4,-1.0,260174.0,55256.0,137000,Technology,Cupertino,CA,no,no,no,yes,3.0,Timothy D. Cook,https://www.apple.com,AAPL,2221176
4,CVS Health,5,3.0,256776.0,6634.0,290000,Health Care,Woonsocket,RI,no,no,yes,yes,8.0,Karen S. Lynch,https://www.cvshealth.com,CVS,98496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Mr. Cooper Group,996,0.0,2007.0,274.0,9100,Financials,Coppell,TX,,no,no,yes,,Jay Bray,https://mrcoopergroup.com,COOP,674.1
996,Herc Holdings,997,0.0,1999.0,47.5,5100,Business Services,Bonita Springs,FL,,no,no,yes,,Lawrence H. Silber,https://www.hercrentals.com,HRI,590.5
997,Healthpeak Properties,998,0.0,1997.4,45.5,204,Financials,Irvine,CA,,no,no,yes,,Thomas M. Herzog,https://www.hcpi.com,PEAK,12059.3
998,SPX FLOW,999,0.0,1996.3,-95.1,5000,Industrials,Charlotte,NC,,no,no,no,,Marcus G. Michael,https://www.spxflow.com,FLOW,1211.8


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   company            1000 non-null   object 
 1   rank               1000 non-null   int64  
 2   rank_change        1000 non-null   float64
 3   revenue            1000 non-null   float64
 4   profit             998 non-null    float64
 5   num. of employees  1000 non-null   int64  
 6   sector             1000 non-null   object 
 7   city               1000 non-null   object 
 8   state              1000 non-null   object 
 9   newcomer           500 non-null    object 
 10  ceo_founder        1000 non-null   object 
 11  ceo_woman          1000 non-null   object 
 12  profitable         1000 non-null   object 
 13  prev_rank          1000 non-null   object 
 14  CEO                992 non-null    object 
 15  Website            1000 non-null   object 
 16  Ticker             938 no

In [30]:
# data['Market Cap']=data['Market Cap'].astype(np.float)
# data['Market Cap']=data['Market Cap'].replace('-',np.NaN).isna()
data[data['Market Cap'].isna()].index

Int64Index([ 35,  73,  76, 104, 231, 237, 253, 254, 268, 299, 306, 318, 319,
            367, 375, 400, 446, 454, 466, 631, 655, 666, 669, 684, 694, 709,
            714, 716, 721, 748, 777, 781, 822, 825, 836, 862, 865, 894, 907,
            958],
           dtype='int64')

# Prepocessing

In [31]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Drop unused columns
    df = df.drop(['rank', 'rank_change', 'company', 'newcomer', 'prev_rank', 'CEO', 'Website', 'Ticker'], axis=1)
    
    # Encode missing values
    df['Market Cap'] = df['Market Cap'].replace('-', np.NaN).astype(np.float)
    
    # Drop missing target rows
    missing_target_rows = df[df['Market Cap'].isna()].index
    df = df.drop(missing_target_rows, axis=0).reset_index(drop=True)
    
    # Fill remaining missing values
    df['profit'] = df['profit'].fillna(df['profit'].mean())
     
    # Binary encoding
    for column in ['ceo_founder', 'ceo_woman', 'profitable']:
        df[column] = df[column].replace({'no': 0, 'yes': 1})
    
    # One-hot encoding
    for column in ['sector', 'city', 'state']:
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    
    # Split df into X and y
    y = df['Market Cap']
    x = df.drop('Market Cap', axis=1)
    
    # Train-test split
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, shuffle=True, random_state=1)
    
#     # Scale X
    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train = pd.DataFrame(scaler.transform(x_train), index=x_train.index, columns=x_train.columns)
    x_test = pd.DataFrame(scaler.transform(x_test), index=x_test.index, columns=x_test.columns)
    
    return x_train, x_test, y_train, y_test

In [32]:
x_train,x_test,y_train,y_test=preprocessing_inputs(data)

In [33]:
x_train

Unnamed: 0,revenue,profit,num. of employees,ceo_founder,ceo_woman,profitable,sector_Aerospace & Defense,sector_Apparel,sector_Business Services,sector_Chemicals,...,state_PA,state_PR,state_RI,state_SC,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI
318,-0.182121,-0.030031,-0.172549,-0.228506,-0.249600,0.413249,-0.161971,-0.12969,4.179979,-0.17609,...,-0.209657,-0.038808,-0.038808,-0.038808,-0.141204,-0.356537,-0.038808,-0.197642,-0.12356,-0.157014
57,1.079306,0.501438,-0.231484,-0.228506,-0.249600,0.413249,-0.161971,-0.12969,-0.239236,-0.17609,...,-0.209657,-0.038808,-0.038808,-0.038808,-0.141204,2.804758,-0.038808,-0.197642,-0.12356,-0.157014
817,-0.372868,-0.252795,-0.276585,-0.228506,4.006405,0.413249,-0.161971,-0.12969,-0.239236,-0.17609,...,-0.209657,-0.038808,-0.038808,-0.038808,-0.141204,-0.356537,-0.038808,-0.197642,-0.12356,-0.157014
708,-0.355584,-0.225409,-0.343187,-0.228506,-0.249600,0.413249,-0.161971,-0.12969,-0.239236,-0.17609,...,-0.209657,-0.038808,-0.038808,-0.038808,-0.141204,2.804758,-0.038808,-0.197642,-0.12356,-0.157014
826,-0.374988,-0.370936,-0.285958,-0.228506,-0.249600,-2.419849,-0.161971,-0.12969,-0.239236,-0.17609,...,-0.209657,-0.038808,-0.038808,-0.038808,-0.141204,-0.356537,-0.038808,-0.197642,-0.12356,-0.157014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,-0.364224,-0.240179,-0.315566,-0.228506,4.006405,0.413249,-0.161971,-0.12969,-0.239236,-0.17609,...,-0.209657,-0.038808,-0.038808,-0.038808,-0.141204,-0.356537,-0.038808,-0.197642,-0.12356,-0.157014
72,0.771302,0.373296,0.548207,-0.228506,-0.249600,0.413249,-0.161971,-0.12969,-0.239236,-0.17609,...,-0.209657,-0.038808,-0.038808,-0.038808,-0.141204,-0.356537,-0.038808,-0.197642,-0.12356,-0.157014
908,-0.384852,-0.369881,-0.295331,-0.228506,-0.249600,-2.419849,-0.161971,-0.12969,-0.239236,-0.17609,...,-0.209657,-0.038808,-0.038808,-0.038808,-0.141204,-0.356537,-0.038808,-0.197642,-0.12356,-0.157014
235,-0.082339,0.162313,0.323263,-0.228506,-0.249600,0.413249,-0.161971,-0.12969,-0.239236,-0.17609,...,-0.209657,-0.038808,-0.038808,-0.038808,-0.141204,-0.356537,-0.038808,-0.197642,-0.12356,-0.157014


In [34]:
y_train

318     12557.0
57      17648.0
817      1311.2
708       307.6
826      1640.2
         ...   
767      4698.7
72       9114.0
908      1307.6
235     15366.0
37     104549.0
Name: Market Cap, Length: 665, dtype: float64

# Training

In [35]:
models = {
    "     Linear Regression": LinearRegression(),
    "Linear Regression (L2)": Ridge(),
    "Linear Regression (L1)": Lasso(),
    "         Decision Tree": DecisionTreeRegressor(),
    "        Neural Network": MLPRegressor(),
    "         Random Forest": RandomForestRegressor(),
    "     Gradient Boosting": GradientBoostingRegressor()
}
for names,model in models.items():
    model.fit(x_train,y_train)
    print(names+" Trained.")

     Linear Regression Trained.
Linear Regression (L2) Trained.


  model = cd_fast.enet_coordinate_descent(


Linear Regression (L1) Trained.
         Decision Tree Trained.




        Neural Network Trained.
         Random Forest Trained.
     Gradient Boosting Trained.


# Testing

In [23]:
for names,model in models.items():
    y_pred=model.predict(x_test)
    rsme=np.sqrt(np.mean((y_test-y_pred)**2))
    print(names +" RSME: {:.2f}".format(rsme))

     Linear Regression RSME: 14156700706724677632.00
Linear Regression (L2) RSME: 150287.44
Linear Regression (L1) RSME: 148364.67
         Decision Tree RSME: 131565.65
        Neural Network RSME: 149367.90
         Random Forest RSME: 87931.71
     Gradient Boosting RSME: 94610.47


In [24]:
for names,model in models.items():
    r2=model.score(x_test,y_test)
    print(names+" R^2 Value:{:.5f}".format(r2))
    


     Linear Regression R^2 Value:-9499630675145856843662753792.00000
Linear Regression (L2) R^2 Value:-0.07060
Linear Regression (L1) R^2 Value:-0.04338
         Decision Tree R^2 Value:0.17952
        Neural Network R^2 Value:-0.05754
         Random Forest R^2 Value:0.63350
     Gradient Boosting R^2 Value:0.57571


### The best Prediction we will get from Random Forest which has Minimum RMSE and Maximum R^2 value 
