In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import xgboost as xgb
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

In [3]:
# Colab-only setup: mount Google Drive into the runtime so files stored on it
# can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')
# Project folder on the mounted Drive; the data-loading cell prepends this to
# the dataset file name.
root_path = '/content/drive/MyDrive/Projects/StartupValuation/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Read the raw startup table from the project folder into a DataFrame.
startup_df = pd.read_csv(filepath_or_buffer=root_path + 'dataset.csv')

In [5]:
# List the column names of the loaded table to see which fields are available.
print(startup_df.columns)

Index(['id', 'category_code', 'status', 'founded_at', 'logo_width',
       'logo_height', 'country_code', 'investment_rounds',
       'invested_companies', 'funding_rounds', 'funding_total_usd',
       'milestones', 'relationships', 'object_id', 'angel', 'crowdfunding',
       'other', 'post_ipo', 'private_equity', 'series_a', 'series_b',
       'series_c', 'venture', 'acquiring_object_id',
       'num_acquisizioni_effettuate', 'acquired_object_id',
       'have_been_acquired', 'funded_object_id_x', 'fin_org_financed',
       'funded_object_id_y', 'person_financed', 'funded_object_id',
       'startup_financed', 'parent_id', 'num_prodotti'],
      dtype='object')


In [None]:
# Data cleaning
# Parse founding dates; values that cannot be parsed become NaT instead of raising.
startup_df['founded_at'] = pd.to_datetime(startup_df['founded_at'], errors='coerce')

# Columns that are one-hot encoded further down.
# NOTE(review): most of these are named like identifiers (`object_id`,
# `parent_id`, ...). If they are close to unique per row, one-hot encoding them
# creates roughly one column per row and carries no generalisable signal;
# confirm their cardinality before keeping them as features.
categorical_features = ['category_code', 'country_code', 'object_id', 'acquiring_object_id',
                        'acquired_object_id', 'funded_object_id_x', 'funded_object_id_y',
                        'funded_object_id', 'parent_id']

# Fill gaps per column type. A frame-wide fillna(0) would also write the integer
# 0 into the datetime column and into string columns, leaving them mixed-typed.
numeric_cols = startup_df.select_dtypes(include='number').columns
startup_df[numeric_cols] = startup_df[numeric_cols].fillna(0)
# Missing categories get an explicit sentinel so they end up in their own dummy
# column; numeric-typed entries of this list were already filled above.
startup_df[categorical_features] = startup_df[categorical_features].fillna('missing')

# Feature engineering: log transform of funding amounts (log1p keeps 0 at 0).
startup_df['log_funding_total_usd'] = np.log1p(startup_df['funding_total_usd'])

# Define target variable: 1 if 'status' is 'operating' or 'acquired', else 0
# (a missing status counts as 0).
# NOTE(review): `have_been_acquired` and `acquired_object_id` presumably encode
# the same "acquired" outcome this label is derived from; confirm they do not
# leak the target into the features.
startup_df['successful'] = startup_df['status'].isin(['operating', 'acquired']).astype(int)

# One-hot encode categorical features. Request float dummies explicitly:
# pandas >= 2.0 defaults to bool, and a frame mixing bool and numeric columns
# converts to an object ndarray, which TensorFlow cannot turn into a tensor.
df_encoded = pd.get_dummies(startup_df, columns=categorical_features, dtype=float)

In [None]:
# Assemble the modelling inputs: the label is the binary `successful` column;
# the features are every encoded column except `id`, `founded_at`, `status`
# and the label itself.
y = df_encoded.loc[:, 'successful']
X = df_encoded.drop(['id', 'founded_at', 'status', 'successful'], axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Train XGBoost model
# Gradient-boosted binary classifier fitted on the training split.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Evaluate model
# Fraction of held-out rows whose predicted class matches the label.
# `accuracy_score` lives in sklearn.metrics and has to be imported in the
# notebook's import cell; without that import this line raises NameError.
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

In [None]:
# Neural-network baseline: a small fully connected binary classifier trained on
# the same train/test split as the XGBoost model.

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable on a fresh "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Keras needs one homogeneous float array. The feature frame can mix dummy
# columns with numeric columns, so cast every column explicitly instead of
# passing the DataFrame through.
X_train_nn = X_train.astype(np.float64).to_numpy()
X_test_nn = X_test.astype(np.float64).to_numpy()

# Standardise with statistics from the training split only (no test leakage).
# The features include raw USD totals next to 0/1 indicators; left unscaled,
# the large-magnitude columns dominate the gradients of the dense layers.
feature_mean = X_train_nn.mean(axis=0)
feature_std = X_train_nn.std(axis=0)
feature_std[feature_std == 0] = 1.0  # constant columns: avoid division by zero
X_train_nn = ((X_train_nn - feature_mean) / feature_std).astype(np.float32)
X_test_nn = ((X_test_nn - feature_mean) / feature_std).astype(np.float32)

y_train_nn = y_train.to_numpy(dtype=np.float32)
y_test_nn = y_test.to_numpy(dtype=np.float32)

# Define the neural network: two hidden ReLU layers with dropout and a sigmoid
# output giving the probability of the positive class.
model = Sequential([
    tf.keras.Input(shape=(X_train_nn.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; Keras holds out the last 20% of the training rows for validation.
history = model.fit(X_train_nn, y_train_nn, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model on the held-out test split.
# Note: this rebinds `accuracy`, which the XGBoost cell also assigns.
loss, accuracy = model.evaluate(X_test_nn, y_test_nn)
print(f'Accuracy: {accuracy}')