# Preprocessing Basic

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier


In [3]:
# Location of the raw dataset, relative to the notebook's working directory.
file_path = '../data/startup data.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Tell the reader exactly where the file is expected, then re-raise so the
    # notebook stops here instead of failing later with a NameError on `df`.
    print(f"ERROR: File not found at '{file_path}'.")
    print(f"Please ensure you have downloaded the startup dataset from Kaggle and saved it as '{file_path}'.")
    raise
except Exception as e:
    # Any other load failure (parse error, permissions, ...) is reported and
    # propagated; continuing without `df` would only hide the real cause.
    print(f"An error occurred during loading: {e}")
    raise

In [4]:
# Show the freshly loaded DataFrame as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,state_code,latitude,longitude,zip_code,id,city,Unnamed: 6,name,labels,...,object_id,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
0,1005,CA,42.358880,-71.056820,92101,c:6669,San Diego,,Bandsintown,1,...,c:6669,0,1,0,0,0,0,1.0000,0,acquired
1,204,CA,37.238916,-121.973718,95032,c:16283,Los Gatos,,TriCipher,1,...,c:16283,1,0,0,1,1,1,4.7500,1,acquired
2,1001,CA,32.901049,-117.192656,92121,c:65620,San Diego,San Diego CA 92121,Plixi,1,...,c:65620,0,0,1,0,0,0,4.0000,1,acquired
3,738,CA,37.320309,-122.050040,95014,c:42668,Cupertino,Cupertino CA 95014,Solidcore Systems,1,...,c:42668,0,0,0,1,1,1,3.3333,1,acquired
4,1002,CA,37.779281,-122.419236,94105,c:65806,San Francisco,San Francisco CA 94105,Inhale Digital,0,...,c:65806,1,1,0,0,0,0,1.0000,1,closed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918,352,CA,37.740594,-122.376471,94107,c:21343,San Francisco,,CoTweet,1,...,c:21343,0,0,1,0,0,0,6.0000,1,acquired
919,721,MA,42.504817,-71.195611,1803,c:41747,Burlington,Burlington MA 1803,Reef Point Systems,0,...,c:41747,1,0,0,1,0,0,2.6667,1,closed
920,557,CA,37.408261,-122.015920,94089,c:31549,Sunnyvale,,Paracor Medical,0,...,c:31549,0,0,0,0,0,1,8.0000,1,closed
921,589,CA,37.556732,-122.288378,94404,c:33198,San Francisco,,Causata,1,...,c:33198,0,0,1,1,0,0,1.0000,1,acquired


In [None]:
# Baseline experiment: measure model accuracy with minimal preprocessing.
# Only the columns listed below are removed before modelling.
# NOTE(review): in the preview above `status` (acquired/closed) appears to mirror
# the `labels` target, so it is presumably dropped to avoid leaking the answer.
columns_to_drop = [
    'Unnamed: 0', 'id', 'Unnamed: 6', 'age_first_milestone_year',
    'age_last_milestone_year', 'status', 'state_code.1', 'closed_at',
]
# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

In [8]:
# List the columns that remain after the drop above.
df.columns

Index(['state_code', 'latitude', 'longitude', 'zip_code', 'city', 'name',
       'labels', 'founded_at', 'first_funding_at', 'last_funding_at',
       'age_first_funding_year', 'age_last_funding_year', 'relationships',
       'funding_rounds', 'funding_total_usd', 'milestones', 'is_CA', 'is_NY',
       'is_MA', 'is_TX', 'is_otherstate', 'category_code', 'is_software',
       'is_web', 'is_mobile', 'is_enterprise', 'is_advertising',
       'is_gamesvideo', 'is_ecommerce', 'is_biotech', 'is_consulting',
       'is_othercategory', 'object_id', 'has_VC', 'has_angel', 'has_roundA',
       'has_roundB', 'has_roundC', 'has_roundD', 'avg_participants',
       'is_top500'],
      dtype='object')

In [12]:
# Baseline run: integer-encode the text columns, hold out 20% of the rows
# (stratified on the target) and print a classification report per model.
target_column = 'labels'
X = df.drop(columns=[target_column])
y = df[target_column]

le = LabelEncoder()

# Replace every object/category column with integer codes. Values are cast to
# str first, so missing entries become the literal "nan" and get their own code.
for col in X.select_dtypes(include=["object", "category"]).columns:
    X[col] = le.fit_transform(X[col].astype(str))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y,
)

# The tree-based models are seeded so repeated runs print the same report,
# matching the random_state=42 already used for the split.
# NOTE(review): features are not scaled in this cell; the recorded output shows
# LogisticRegression stopping at max_iter with a hint to scale the data.
classifiers = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"\n=========== {name} ===========")
    # zero_division=0 reports 0.0 for a class that is never predicted instead of
    # emitting an UndefinedMetricWarning for each averaged metric.
    print(classification_report(y_test, y_pred, zero_division=0))


              precision    recall  f1-score   support

           0       0.75      0.65      0.69        65
           1       0.82      0.88      0.85       120

    accuracy                           0.80       185
   macro avg       0.79      0.76      0.77       185
weighted avg       0.80      0.80      0.80       185


              precision    recall  f1-score   support

           0       0.59      0.20      0.30        65
           1       0.68      0.93      0.78       120

    accuracy                           0.67       185
   macro avg       0.64      0.56      0.54       185
weighted avg       0.65      0.67      0.61       185


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        65
           1       0.65      1.00      0.79       120

    accuracy                           0.65       185
   macro avg       0.32      0.50      0.39       185
weighted avg       0.42      0.65      0.51       185


              p

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Preprocessing level 2

In [13]:
# Second experiment: additionally drop the coordinates, the is_* state and
# category indicator columns, and `object_id`.
# NOTE(review): the indicator columns are presumably redundant with
# `state_code` / `category_code`, which are kept — confirm.
columns_to_drop = [
    'latitude', 'longitude', 'is_CA', 'is_NY',
    'is_MA', 'is_TX', 'is_otherstate', 'is_software',
    'is_web', 'is_mobile', 'is_enterprise', 'is_advertising',
    'is_gamesvideo', 'is_ecommerce', 'is_biotech', 'is_consulting',
    'is_othercategory', 'object_id',
]
# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

In [15]:
# Same evaluation as the baseline, run on the reduced column set: integer-encode
# the text columns, hold out 20% of the rows (stratified on the target) and
# print a classification report per model.
target_column = 'labels'
X = df.drop(columns=[target_column])
y = df[target_column]

le = LabelEncoder()

# Replace every object/category column with integer codes. Values are cast to
# str first, so missing entries become the literal "nan" and get their own code.
for col in X.select_dtypes(include=["object", "category"]).columns:
    X[col] = le.fit_transform(X[col].astype(str))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y,
)

# The tree-based models are seeded so repeated runs print the same report,
# matching the random_state=42 already used for the split.
classifiers = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"\n=========== {name} ===========")
    # zero_division=0 reports 0.0 for a class that is never predicted instead of
    # emitting an UndefinedMetricWarning for each averaged metric.
    print(classification_report(y_test, y_pred, zero_division=0))


              precision    recall  f1-score   support

           0       0.76      0.65      0.70        65
           1       0.82      0.89      0.86       120

    accuracy                           0.81       185
   macro avg       0.79      0.77      0.78       185
weighted avg       0.80      0.81      0.80       185


              precision    recall  f1-score   support

           0       0.53      0.40      0.46        65
           1       0.71      0.81      0.76       120

    accuracy                           0.66       185
   macro avg       0.62      0.60      0.61       185
weighted avg       0.65      0.66      0.65       185


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        65
           1       0.65      1.00      0.79       120

    accuracy                           0.65       185
   macro avg       0.32      0.50      0.39       185
weighted avg       0.42      0.65      0.51       185


              p

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [34]:
# List the columns currently in df.
# NOTE(review): the saved output already contains has_RoundABCD / has_Investor /
# has_Seed, which are only created by the feature-engineering cell further down,
# so this output came from an out-of-order run.
df.columns

Index(['state_code', 'zip_code', 'city', 'name', 'labels', 'founded_at',
       'first_funding_at', 'last_funding_at', 'age_first_funding_year',
       'age_last_funding_year', 'relationships', 'funding_rounds',
       'funding_total_usd', 'milestones', 'category_code', 'has_VC',
       'has_angel', 'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD',
       'avg_participants', 'is_top500', 'has_RoundABCD', 'has_Investor',
       'has_Seed'],
      dtype='object')

In [49]:
# ------------------------------
# 0. Engineered funding flags
# ------------------------------
# Stored explicitly as int64: np.where(..., 1, 0) yields the platform's default
# integer type, and a column whose dtype is not picked up by the selectors in
# step 2 would be silently dropped by ColumnTransformer (remainder="drop").
round_cols = ['has_roundA', 'has_roundB', 'has_roundC', 'has_roundD']
investor_cols = ['has_VC', 'has_angel']
# 1 if the startup raised any of rounds A-D.
df['has_RoundABCD'] = df[round_cols].eq(1).any(axis=1).astype('int64')
# 1 if the startup has VC or angel backing.
df['has_Investor'] = df[investor_cols].eq(1).any(axis=1).astype('int64')
# 1 if there is an investor but no lettered round.
df['has_Seed'] = (df['has_RoundABCD'].eq(0) & df['has_Investor'].eq(1)).astype('int64')


# ------------------------------
# 1. Split X, y
# ------------------------------
target_column = 'labels'
# NOTE(review): has_roundA/B/C are dropped in favour of has_RoundABCD, but
# has_roundD is kept as a feature — confirm whether that is intentional.
X = df.drop(columns=[target_column, 'founded_at', 'name', 'has_roundA', 'has_roundB', 'has_roundC', 'has_VC',
       'has_angel',], )
y = df[target_column]

# -----------------------------------------------------
# 2. Identify categorical + numeric columns
# -----------------------------------------------------
cat_cols = X.select_dtypes(include=["object", "category"]).columns
# "number" matches every numeric width, not just int64/float64.
num_cols = X.select_dtypes(include="number").columns

# -----------------------------------------------------
# 3. Preprocessing: OneHotEncode categorical + pass-through numeric
# -----------------------------------------------------
# Tree-based models get the numeric columns untouched.
preprocess_no_scaling = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

# Logistic regression, SVM and the MLP get standardized numeric columns.
preprocess_with_scaling = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

# -----------------------------------------------------
# 4. Classifiers (pipelines)
# -----------------------------------------------------
# Every stochastic estimator is seeded with 42 so the printed reports are
# reproducible across runs.
classifiers = {
    "Random Forest": Pipeline([
        ("prep", preprocess_no_scaling),
        ("clf", RandomForestClassifier(max_depth=None, min_samples_split=2, random_state=42))
    ]),

    "Decision Tree": Pipeline([
        ("prep", preprocess_no_scaling),
        ("clf", DecisionTreeClassifier(random_state=42))
    ]),

    "Logistic Regression": Pipeline([
        ("prep", preprocess_with_scaling),
        ("clf", LogisticRegression(max_iter=2000))
    ]),

    "SVM": Pipeline([
        ("prep", preprocess_with_scaling),
        ("clf", SVC())
    ]),

    "Neural Network (MLPClassifier)": Pipeline([
        ("prep", preprocess_with_scaling),
        ("clf", MLPClassifier(
            hidden_layer_sizes=(64, 32),
            activation="relu",
            solver="adam",
            max_iter=500,
            random_state=42
        ))
    ]),

    "XGBoost": Pipeline([
        ("prep", preprocess_no_scaling),
        ("clf", XGBClassifier(
            n_estimators=400,
            max_depth=8,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            # Binary objective for a two-class target, multi-class otherwise.
            objective="binary:logistic" if y.nunique() == 2 else "multi:softprob",
            eval_metric="logloss",
            random_state=42
        ))
    ])
}

# -----------------------------------------------------
# 5. Train-test split
# -----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------------------------------
# 6. Train & evaluate
# -----------------------------------------------------
# Each pipeline fits its preprocessing on the training split only, then is
# scored on the held-out split.
for name, model in classifiers.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n=========== {name} ===========")
    print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.76      0.60      0.67        65
           1       0.81      0.90      0.85       120

    accuracy                           0.79       185
   macro avg       0.79      0.75      0.76       185
weighted avg       0.79      0.79      0.79       185


              precision    recall  f1-score   support

           0       0.63      0.60      0.61        65
           1       0.79      0.81      0.80       120

    accuracy                           0.74       185
   macro avg       0.71      0.70      0.71       185
weighted avg       0.73      0.74      0.73       185


              precision    recall  f1-score   support

           0       0.63      0.60      0.61        65
           1       0.79      0.81      0.80       120

    accuracy                           0.74       185
   macro avg       0.71      0.70      0.71       185
weighted avg       0.73      0.74      0.73       185


              p

In [47]:
# Persist the current state of df as CSV; the row index is not written.
df.to_csv(
    path_or_buf='../data/startup_data_processed.csv',
    index=False,
)