In [1]:
import logging
import os

import pandas as pd
import psycopg2
from startupradar.transformers.core import DomainTextTransformer, WhoisTransformer
from startupradar.transformers.util.api import StartupRadarAPI
import numpy as np

from utils import *

In [2]:
rating_threshold = 5
keeptop = 100

In [3]:
df = pd.read_parquet(".data/full.parquet")

# Check for missing values in each column
missing_values = df.isnull().sum()

for (col, val) in zip(missing_values.keys(), missing_values.values):
    if val > 0:
        print(f"Column {col} is missing {val} entries.")

# To find words that are correlated with highly-rated startups, 
# find most occuring words for highly-rated startups and discard 
# those words that also occur frequently for other startups.
df_filtered_good = df.loc[df['Rating'] >= rating_threshold]
words_good = []
for text in df_filtered_good.text:
    words_good += split_text(text)
word_counts_good = get_word_counts(words_good, keeptop=keeptop)

df_filtered_bad = df.loc[df['Rating'] < rating_threshold]
words_bad = []
for text in df_filtered_bad.text:
    words_bad += split_text(text)
word_counts_bad = get_word_counts(words_bad, keeptop=keeptop)

for word_good in word_counts_good.keys():
    if word_good not in word_counts_bad.keys():
        print(f"{word_good}")

Column created is missing 166 entries.
Column changed is missing 180 entries.
Column expires is missing 166 entries.
Column days_since_created is missing 166 entries.
Column days_since_changed is missing 180 entries.
Column e0 is missing 95 entries.
Column e1 is missing 95 entries.
Column e2 is missing 95 entries.
Column e3 is missing 95 entries.
Column e4 is missing 95 entries.
Column e5 is missing 95 entries.
Column e6 is missing 95 entries.
Column e7 is missing 95 entries.
Column e8 is missing 95 entries.
Column e9 is missing 95 entries.
Column e10 is missing 95 entries.
Column e11 is missing 95 entries.
Column e12 is missing 95 entries.
Column e13 is missing 95 entries.
Column e14 is missing 95 entries.
Column e15 is missing 95 entries.
Column e16 is missing 95 entries.
Column e17 is missing 95 entries.
Column e18 is missing 95 entries.
Column e19 is missing 95 entries.
Column e20 is missing 95 entries.
Column e21 is missing 95 entries.
Column e22 is missing 95 entries.
Column e23 

In [4]:
df

Unnamed: 0_level_0,Rating,created,changed,expires,days_since_created,days_since_changed,text,e0,e1,e2,...,e1526,e1527,e1528,e1529,e1530,e1531,e1532,e1533,e1534,e1535
Domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1komma5grad.com,2.0,2021-03-31,2022-12-04,2023-03-31,815.852575,202.852575,1KOMMA5° - Klimaneutrale Energiesysteme kaufen...,,,,...,,,,,,,,,,
2050-materials.com,4.0,2021-02-16,2023-02-17,2025-02-16,858.852575,127.852575,2050 Materials - Sustainability Data for Build...,0.005431,-0.002820,-0.016823,...,0.017510,0.012522,0.039410,-0.007863,-0.029448,-0.040398,0.000554,-0.001966,-0.018566,-0.022628
24ft.de,2.0,NaT,NaT,NaT,,,Temporäres Wohnen & Arbeiten in Containern | 2...,0.031842,0.004433,0.027297,...,-0.013292,0.009777,0.066854,-0.021602,-0.006173,0.003640,-0.004737,-0.005790,-0.016013,-0.037655
3dspark.de,2.0,NaT,NaT,NaT,,,Home - 3D Spark Home - 3D Spark Toggle navigat...,-0.007603,-0.017080,-0.019809,...,0.010292,0.008045,0.040895,-0.016510,-0.018601,-0.027440,0.008812,0.010034,-0.005859,-0.006290
42watt.de,4.0,NaT,NaT,NaT,,,"42watt - Energieberatung, Förderservice und Sa...",0.017485,-0.004283,-0.004433,...,0.019542,0.008723,0.049317,-0.019243,-0.029814,0.001512,-0.000394,0.002244,-0.021573,-0.026221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zbs-food.com,2.0,2019-05-08,2019-05-08,2023-05-08,1508.852575,1508.852575,Zero Bullshit - Gut für Dich & die Umwelt! Zer...,,,,...,,,,,,,,,,
zeg.ai,1.0,NaT,NaT,NaT,,,⭐ ZEG – Your Personal Virtual Studio ⭐ ZEG – Y...,-0.017512,-0.008878,-0.015928,...,0.013465,0.025604,0.013370,-0.023358,-0.028933,-0.000631,-0.025469,0.009081,-0.005789,-0.048421
zeroavia.com,1.0,2017-10-29,2022-10-22,2023-10-29,2064.852575,245.852575,First Practical Zero Emission Aviation Powertr...,0.002655,-0.023435,-0.000525,...,0.005417,0.014120,0.031612,-0.019192,-0.023903,-0.014602,0.001051,0.028052,-0.014254,-0.016061
zuluforest.com,3.0,2020-08-17,2022-08-18,2023-08-17,1041.852575,310.852575,Zulu Forest Sciences Zulu Forest Sciences Serv...,0.006494,-0.036504,-0.009214,...,0.020278,-0.002720,0.022006,0.005308,-0.015178,-0.015015,-0.017028,0.007541,-0.006654,-0.026072


In [5]:
embeddings = df.filter(regex=r'^e\d+')
ratings = df["Rating"][embeddings.notna().all(axis=1)]
embeddings = embeddings.dropna()

assert ratings.shape[0] == embeddings.shape[0]

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = [x.values for x in train_test_split(embeddings, ratings, train_size=0.7, stratify=ratings)]


In [8]:
from sklearn import linear_model
from sklearn import pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [9]:
regressor = linear_model.Ridge()

In [10]:
regressor.fit(X_train, y_train)

In [11]:
mean_squared_error(regressor.predict(X_test), y_test), mean_squared_error(regressor.predict(X_train), y_train)

(1.0031744689166764, 0.7212722675015154)

In [12]:
m = pipeline.Pipeline([("standardscaler", StandardScaler()), ("regressor", regressor)])

# m = pipeline.Pipeline([("regressor", regressor)]) # for ablation

In [13]:
m.fit(X_train, y_train)

In [14]:
mean_squared_error(m.predict(X_test), y_test), mean_squared_error(m.predict(X_train), y_train)

(1.7292273462029726, 0.011947893475815341)

In [15]:
from sklearn import model_selection

In [16]:
param_grid = {
    "regressor__alpha": [10**i for i in np.linspace(2, 4, 10)]
}

In [17]:
cv = model_selection.GridSearchCV(m, param_grid, cv=5, verbose=3, scoring="neg_mean_squared_error")

In [18]:
cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ...........regressor__alpha=100.0;, score=-1.390 total time=   0.1s
[CV 2/5] END ...........regressor__alpha=100.0;, score=-1.314 total time=   0.2s
[CV 3/5] END ...........regressor__alpha=100.0;, score=-1.325 total time=   0.2s
[CV 4/5] END ...........regressor__alpha=100.0;, score=-1.443 total time=   0.1s
[CV 5/5] END ...........regressor__alpha=100.0;, score=-1.293 total time=   0.1s


[CV 1/5] END regressor__alpha=166.81005372000593;, score=-1.300 total time=   0.2s
[CV 2/5] END regressor__alpha=166.81005372000593;, score=-1.217 total time=   0.1s
[CV 3/5] END regressor__alpha=166.81005372000593;, score=-1.247 total time=   0.1s
[CV 4/5] END regressor__alpha=166.81005372000593;, score=-1.344 total time=   0.1s
[CV 5/5] END regressor__alpha=166.81005372000593;, score=-1.183 total time=   0.0s
[CV 1/5] END regressor__alpha=278.2559402207126;, score=-1.210 total time=   0.0s
[CV 2/5] END regressor__alpha=278.2559402207126;, score=-1.120 total time=   0.0s
[CV 3/5] END regressor__alpha=278.2559402207126;, score=-1.174 total time=   0.0s
[CV 4/5] END regressor__alpha=278.2559402207126;, score=-1.246 total time=   0.0s
[CV 5/5] END regressor__alpha=278.2559402207126;, score=-1.080 total time=   0.0s
[CV 1/5] END regressor__alpha=464.15888336127773;, score=-1.130 total time=   0.1s
[CV 2/5] END regressor__alpha=464.15888336127773;, score=-1.037 total time=   0.1s
[CV 3/5] 

In [19]:
list(zip(cv.cv_results_["params"], cv.cv_results_["mean_test_score"]))

[({'regressor__alpha': 100.0}, -1.3529300163498907),
 ({'regressor__alpha': 166.81005372000593}, -1.2581189513410374),
 ({'regressor__alpha': 278.2559402207126}, -1.1660504786113344),
 ({'regressor__alpha': 464.15888336127773}, -1.086129003089098),
 ({'regressor__alpha': 774.263682681127}, -1.0247176769359208),
 ({'regressor__alpha': 1291.549665014884}, -0.9847423673605735),
 ({'regressor__alpha': 2154.4346900318824}, -0.9668213315038189),
 ({'regressor__alpha': 3593.813663804626}, -0.9706512308093014),
 ({'regressor__alpha': 5994.8425031894085}, -0.9957688035095824),
 ({'regressor__alpha': 10000.0}, -1.041642100832177)]

In [20]:
predictions_train = cv.best_estimator_.predict(X_train)
predictions_test = cv.best_estimator_.predict(X_test)

In [21]:
mean_squared_error(predictions_train, y_train), mean_squared_error(predictions_test, y_test)

(0.49814566481536604, 0.9934220455234609)