In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from prophet import Prophet

For this work we will need our 3 different sources of data:

*   research from the field labeled on topics
*   patents from the field labeled on topics
*   overall financial data per country


The topics were labeled using Claude AI, then verified and condensed by a **quantum expert** so that they only represent meaningful information, without repetition.


In [61]:
# Load the three data sources: labeled patents, labeled research papers and
# per-country funding (the funding file is semicolon-separated).
patents = pd.read_csv("patents_labeled.csv")
research = pd.read_csv("cleaned_final_data.csv")
financial = pd.read_csv("quantum_funding_with_all_countries.csv", sep=";")

In [62]:
# Inspect the column names of each dataset before aligning them.
print("Patents columns:", patents.columns)
print("Research columns:", research.columns)
print("Financial columns:", financial.columns)

Patents columns: Index(['#', 'Publication Year', 'Title', 'Abstract', 'Applicants', 'Inventors',
       'Country', 'Label'],
      dtype='object')
Research columns: Index(['Authors', 'Title', 'Year', 'Cited by', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Author_list', 'Country',
       'Label'],
      dtype='object')
Financial columns: Index(['year', 'Armenia', 'Austria', 'Belgium', 'Bulgaria', 'Switzerland',
       'Cyprus', 'Czech Republic', 'Germany', 'Denmark', 'Estonia', 'Greece',
       'Spain', 'Finland', 'France', 'Croatia', 'Hungary', 'Ireland', 'Israel',
       'Italy', 'Lithuania', 'Luxembourg', 'Latvia', 'Netherlands', 'Norway',
       'Poland', 'Portugal', 'Romania', 'Serbia', 'Sweden', 'Slovenia',
       'Slovakia', 'Turkey', 'Ukraine', 'United Kingdom', 'USA', 'Canada',
       'Japan', 'China', 'South Korea', 'India', 'Australia', 'Singapore'],
      dtype='object')


We see that the **"Year"** column is not named consistently across the 3 datasets we have. We first need to fix this so we can aggregate the data per year.

In [63]:
# Give the patents year column the same name as in the research data ("Year").
patents = patents.rename(columns={"Publication Year": "Year"})

For the financial data, we keep things simple and aggregate all the invested money per year, regardless of the quantum subfield.

In [64]:
# The raw table is wide: a "year" column plus one column per country, where each
# cell holds the amount of money invested by that country in that year.
# Reshape it to long format with the columns year, Country and Count (the value).
financial_long = financial.melt(
    id_vars="year",
    var_name="Country",
    value_name="Count",
).assign(
    # make sure the amounts are numeric; values that cannot be parsed become NaN
    # (this dataset was made by Irene)
    Count=lambda frame: pd.to_numeric(frame["Count"], errors="coerce")
)

# Sum the investments of all countries for each year, then rename the columns
# for clarity.
financial_grouped = (
    financial_long
    .groupby("year", as_index=False)["Count"]
    .sum()
    .rename(columns={"year": "Year", "Count": "Financial"})
)

In [167]:
# Quick sanity check that the aggregation worked, using 2017 as an example.
print(financial_grouped[financial_grouped["Year"] == 2017])


   Year  Financial  Financial_normalized
3  2017       6268              0.257953


Now we need to count the number of patents **per year** and **per specific label** so we can use it in the final combined formula. The idea is that users can set their preferences based on the subtopic they want to explore.

In [170]:
def count_per_label_per_year(df):
    """Count the rows of ``df`` for every (Year, Label) pair.

    The function only relies on the "Year" and "Label" columns, so it is used
    for the patents as well as for the research papers dataset.

    Returns a DataFrame with the columns "Year", "Label" and "Count".
    """
    group_sizes = df.groupby(["Year", "Label"]).size()
    return group_sizes.reset_index(name="Count")


# Apply the same function to both datasets, because they are organised the same way.
patent_counts = count_per_label_per_year(patents)
research_counts = count_per_label_per_year(research)


The next step is to **normalize the data** in the financial part so it lies between 0 and 1 rather than being huge numbers.

In [171]:
# Work on a copy of the grouped frame before adding a column to it.
financial_grouped = financial_grouped.copy()

scaler = MinMaxScaler()
# Min-max scale the grouped yearly totals and store the result in a new column.
financial_grouped["Financial_normalized"] = scaler.fit_transform(financial_grouped[["Financial"]])


Now we will merge the patent counts per year with the research counts based on **year and label**, and then merge the result with the financial data based on year. The second merge is a left join, so we keep all the rows produced by the first step.

In [172]:
combined = (
    # outer join: keep every (Year, Label) pair that appears in either dataset;
    # the shared "Count" column gets the _patents / _research suffixes
    pd.merge(
        patent_counts,
        research_counts,
        on=["Year", "Label"],
        how="outer",
        suffixes=("_patents", "_research"),
    )
    # left join: keep all rows from the step above and attach the yearly funding
    .merge(financial_grouped[["Year", "Financial_normalized"]], on="Year", how="left")
    # values missing after the joins are treated as 0
    .fillna(0)
)

# Same normalization for patents and research, to bring them onto the same scale.
scaler = MinMaxScaler()
combined[["Count_patents", "Count_research"]] = scaler.fit_transform(
    combined[["Count_patents", "Count_research"]]
)

# Rename back to "Financial" to avoid any confusion.
combined = combined.rename(columns={"Financial_normalized": "Financial"})

# IMPORTANT
# The scores are computed with this formula, but later in the app the weights
# can be personalized. Using these weights is strongly recommended, because
# they were chosen based on how much data is available for each source.
combined["WeightedScore"] = (
    0.55 * combined["Count_patents"]
    + 0.35 * combined["Count_research"]
    + 0.1 * combined["Financial"]
)


In [94]:
# Remove the rows carrying these two labels from the final data and from both
# source datasets.
# NOTE(review): in top-to-bottom order this cell runs after the per-label counts
# and their scaling were computed, so those were built with these labels still
# present - confirm whether this filtering should happen earlier.
invalid_labels = ["error", "invalid_label"]

combined = combined[~combined["Label"].isin(invalid_labels)].copy()
patents = patents[~patents["Label"].isin(invalid_labels)].copy()
research = research[~research["Label"].isin(invalid_labels)].copy()


In [173]:
# Preview the first 25 rows of the combined table.
combined.head(25)

Unnamed: 0,Year,Label,Count_patents,Count_research,Financial,WeightedScore
0,2017,quantum algorithms,0.029903,0.050718,0.257953,0.059993
1,2017,quantum coherence,0.014604,0.038278,0.257953,0.047224
2,2017,quantum communication,0.047288,0.02201,0.257953,0.059507
3,2017,quantum compilation,0.004868,0.006699,0.257953,0.030817
4,2017,quantum computational complexity,0.0,0.005742,0.257953,0.027805
5,2017,quantum computing,0.189152,0.211483,0.257953,0.203848
6,2017,quantum computing applications,0.223922,0.039234,0.257953,0.162685
7,2017,quantum computing foundations,0.0,0.004785,0.257953,0.02747
8,2017,quantum computing hardware (hybrid),0.106398,0.044019,0.257953,0.099721
9,2017,quantum computing theory,0.006259,0.152153,0.257953,0.082491


I showed this to make clear why I will weight patents more than research: less data is available for research, and for some labels the normalized value is sometimes 0.

Next I will check the total counts of each label to get some insight into which custom model to use depending on data availability - our previous research clearly showed a need for this.

In [140]:
# Count how many times each label appears among the patents.
patent_counts_per_label = (
    patents["Label"]
    .value_counts()
    .rename_axis("Label")
    .reset_index(name="Patent_Count")
)

# Do the same for the research papers.
research_counts_per_label = (
    research["Label"]
    .value_counts()
    .rename_axis("Label")
    .reset_index(name="Research_Count")
)

# Merge both tables, fill the counts of labels missing from one source with 0,
# add the total per label and order the labels from most to least data.
label_counts = (
    patent_counts_per_label
    .merge(research_counts_per_label, on="Label", how="outer")
    .fillna(0)
    .astype({"Patent_Count": int, "Research_Count": int})
    .assign(Total_Count=lambda counts: counts["Patent_Count"] + counts["Research_Count"])
    .sort_values("Total_Count", ascending=False)
    .reset_index(drop=True)
)

print(label_counts.to_string(index=False))

                              Label  Patent_Count  Research_Count  Total_Count
                  quantum computing          6216            4707        10923
     quantum computing applications          5256            2610         7866
               quantum optimization          4495            2035         6530
                   quantum hardware          4636             420         5056
               quantum cryptography          2948            1820         4768
quantum computing hardware (hybrid)          3531            1020         4551
                    quantum sensing          3685             438         4123
             quantum security tools          2721             783         3504
                 quantum algorithms          1056            1724         2780
                    quantum control          2135             551         2686
           quantum machine learning          1399            1284         2683
           quantum error correction           810   

In [71]:
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")
import numpy as np

I will hard-code these rules to define which polynomial order (degree) of the model we need to use.

I did some testing beforehand: order 5 gave the best results for the labels with the largest quantity of data, while for labels with little data even order 1 (a linear regression model) can obtain the smallest RMSE on the validation set (the last 2 years of data).

In [141]:
def determine_degree(total_count):
    """Pick the polynomial degree for a label from its total record count.

    Thresholds are checked from largest to smallest: at least 8000 records
    gives degree 5, at least 2000 gives 3, at least 1000 gives 2, and anything
    below that falls back to degree 1.
    """
    degree_by_minimum_count = ((8000, 5), (2000, 3), (1000, 2))
    for minimum_count, degree in degree_by_minimum_count:
        if total_count >= minimum_count:
            return degree
    return 1

In [100]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

The next step is to prepare the training. We first apply the function defined above to determine which order of the model we want to use.

For training, we first use all the data from 2017-2022 and keep 2023-2024 for **validation**.

The latest years of data are more meaningful and also sit higher on the curve, so once the validation set has told us which model to pick, it is sensible to train again on all the available data.

Why we used Ridge regularization is explained in detail in our previous research - in short, it is there to avoid overfitting, because we do not have much data, although at the same time there is enough of it to fit a model.

In [174]:
def test(label_name, alpha, optimism_factor):
    """Validate and forecast one label's WeightedScore with polynomial Ridge regression.

    The polynomial degree is chosen by ``determine_degree`` from the label's
    ``Total_Count`` in ``label_counts``. The model is first fitted on 2017-2022
    and scored on 2023-2024 (RMSE); it is then refitted on 2017-2024 and used
    to predict 2025-2028. Everything is printed; nothing is returned.

    Parameters
    ----------
    label_name : str
        Label to look up in ``label_counts`` and ``combined``.
    alpha : float
        Regularization strength passed to ``Ridge``.
    optimism_factor : float
        Multiplier applied to the future predictions (e.g. 1.05 boosts them by 5%).

    Notes
    -----
    Reads the notebook-level ``label_counts`` and ``combined`` frames. Prints a
    message and returns early when the label is unknown, or when the training
    or validation window contains no rows for it.
    """
    row = label_counts[label_counts["Label"] == label_name]
    if row.empty:
        print(f"Label not found: {label_name}")
        return
    # the amount of data available for the label decides the polynomial degree
    degree = determine_degree(row["Total_Count"].values[0])

    df = combined[combined["Label"] == label_name][["Year", "WeightedScore"]].copy()
    # Sort chronologically: the sample weights below are assigned by row
    # position, so a later row must really be a more recent year.
    df = df[(df["Year"] >= 2017) & (df["Year"] <= 2024)].sort_values("Year")

    # split as discussed before: 2017-2022 for training, 2023-2024 for validation
    train_df = df[df["Year"] <= 2022]
    val_df = df[df["Year"] > 2022]
    if train_df.empty or val_df.empty:
        # fitting or scoring on an empty window would fail inside scikit-learn
        print(f"Not enough yearly data to validate label: {label_name}")
        return

    X_train = train_df["Year"].values.reshape(-1, 1)
    y_train = train_df["WeightedScore"].values
    X_val = val_df["Year"].values.reshape(-1, 1)
    y_val = val_df["WeightedScore"].values

    # NOTE(review): the features are powers of the raw year (~2e3), which reach
    # ~3e16 at degree 5. This may make the fit ill-conditioned and makes the
    # Ridge penalty scale-dependent - consider centering the years; left
    # unchanged here so the scores stay comparable with earlier runs.
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)

    # linearly increasing weights give more importance to recent years
    year_weights = np.linspace(1.0, 2.0, len(y_train))
    # Ridge applies the regularization
    model = Ridge(alpha=alpha)
    model.fit(X_train_poly, y_train, sample_weight=year_weights)
    val_pred = model.predict(X_val_poly)

    rmse = mean_squared_error(y_val, val_pred) ** 0.5

    # These prints are kept for testing and exploration only; in the app they
    # will stay under the hood.
    print(f"\nLabel: {label_name} (Degree={degree})")
    print(f" Validation RMSE (2023–2024): {rmse:.4f}")
    print("True vs Predicted (Validation):")
    for year, true, pred in zip(X_val.flatten(), y_val, val_pred):
        print(f"   {year}: true={true:.4f} | pred={pred:.4f}")

    # retrain on the full 2017-2024 data, again weighting recent years more
    X_full = df["Year"].values.reshape(-1, 1)
    y_full = df["WeightedScore"].values
    X_full_poly = poly.fit_transform(X_full)
    full_weights = np.linspace(1.0, 2.0, len(y_full))
    model.fit(X_full_poly, y_full, sample_weight=full_weights)

    # The model was seen to underestimate a bit on the validation set, which
    # does not match how the quantum market behaved in the past, so the future
    # predictions are scaled by optimism_factor.
    X_future = np.arange(2025, 2029).reshape(-1, 1)
    future_preds = model.predict(poly.transform(X_future)) * optimism_factor

    # 2024 is the growth baseline; a label without a 2024 row gets NaN growth
    # instead of raising an IndexError.
    scores_2024 = df.loc[df["Year"] == 2024, "WeightedScore"].values
    baseline_2024 = scores_2024[0] if len(scores_2024) else np.nan

    print("\nFuture Predictions & % Growth vs 2024:")
    for year, curr_score in zip(X_future.flatten(), future_preds):
        if np.isnan(baseline_2024) or baseline_2024 == 0:
            growth = np.nan
        else:
            growth = ((curr_score - baseline_2024) / baseline_2024) * 100
        print(f"   {year}: score={curr_score:.4f} | growth vs 2024={growth:.2f}%")



Now we input the optimism weight and the hyperparameter of the **RIDGE REGULARIZATION** (alpha) to test the function.

In [175]:
# Try the function on one label, with alpha=0.1 for Ridge and a 5% optimism boost.
test("quantum error correction", alpha=0.1, optimism_factor=1.05)


🔎 Label: quantum error correction (Degree=3)
✅ Validation RMSE (2023–2024): 0.0548
📊 True vs Predicted (Validation):
   2023: true=0.2338 | pred=0.1892
   2024: true=0.2761 | pred=0.2127

📈 Future Predictions & % Growth vs 2024:
   2025: score=0.3486 | growth vs 2024=26.27%
   2026: score=0.4154 | growth vs 2024=50.45%
   2027: score=0.4894 | growth vs 2024=77.24%
   2028: score=0.5706 | growth vs 2024=106.67%
