In [1]:
import pandas as pd
import numpy as np
import string
import nltk
import gensim
import wordcloud

from nltk import PorterStemmer
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer 
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


[nltk_data] Downloading package stopwords to c:\Users\fuzzi\AppData\Lo
[nltk_data]     cal\Programs\Python\Python311\lib\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to c:\Users\fuzzi\AppData\Loca
[nltk_data]     l\Programs\Python\Python311\lib\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to c:\Users\fuzzi\AppData\Lo
[nltk_data]     cal\Programs\Python\Python311\lib\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
### Reading the files
df = pd.read_csv('data/grants.csv')
df_amt = pd.read_csv('raw data/CA.US-Social.csv', encoding='ISO-8859-1')
df.rename(columns={'Project title': 'Project_Title'}, inplace=True)
merged_df = pd.merge(df, df_amt, on='Project_Title', how ='left')
merged_df = merged_df[['Project_Title', 'CIHR contribution', 'CIHR_Contribution', 'Abstract/Summary' ]]
merged_df = merged_df.replace('nan', np.nan)
merged_df = merged_df.dropna()
merged_df['CIHR_Contribution'] = merged_df['CIHR_Contribution'].replace({'\$': '', ',': ''}, regex=True).astype(int)
merged_df['CIHR contribution'] = merged_df['CIHR contribution'].astype(int)

mismatched_indices = merged_df[merged_df['CIHR contribution'] != merged_df['CIHR_Contribution']].index
# Drop those rows
merged_df = merged_df.drop(mismatched_indices)  
merged_df = merged_df[merged_df['CIHR contribution'] != 0]

In [4]:
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import word_tokenize
stop = set(stopwords.words('english'))
custom_stopwords = {'de', 'et', 'le', 'à', 'de', 'la', 'en', 'santé', 'pour', 'dans'}
stop.update(custom_stopwords)
punctuation = set(string.punctuation)
lemma = nltk.WordNetLemmatizer()

def clean(text):
    doc1 = text.lower()
    doc2 = doc1.split()
    doc3=[val for val in doc2 if val not in stop]
    doc4=" ".join([val for val in doc3]) 
    doc5=[val for val in doc4 if not val.isdigit()] #Exclude digits
    doc6="".join([val for val in doc5]) #compare to "".join(val for val in doc5)
    doc7=[val for val in doc6 if val not in punctuation]
    doc8="".join([val for val in doc7])
    doc9=doc8.split()
    doc10= [lemma.lemmatize(val) for val in doc9]
    doc11=" ".join([val for val in doc10])
    tokens = nltk.word_tokenize(doc11)
    return tokens

merged_df['clean_abstr'] = merged_df['Abstract/Summary'].apply(clean)
merged_df['Combined_text'] = merged_df['clean_abstr'].apply(lambda x: ' '.join(x))
clean_token = [token for tokens in merged_df['clean_abstr'] for token in tokens]

from gensim.models import Phrases
from gensim.models.phrases import Phraser

from gensim.corpora import Dictionary


### COllocation
abstracts = merged_df['clean_abstr'].tolist()

##Finding bigrams
bigram = gensim.models.Phrases(merged_df['clean_abstr'], min_count=5, threshold=10)
bigram_model = gensim.models.phrases.Phraser(bigram)
merged_df['tokenized_bigrams'] = [bigram_model[doc] for doc in merged_df['clean_abstr']]
merged_df['Combined_bigram'] = merged_df['tokenized_bigrams'].apply(lambda x: ' '.join(x))
merged_df['Grant_Amount'] = merged_df['CIHR contribution']
labels = ['Very Small','Small', 'Moderate', 'Large', 'Very Large']
bins = [merged_df['CIHR contribution'].min(), 
        merged_df['CIHR contribution'].quantile(0.2), 
        merged_df['CIHR contribution'].quantile(0.4), 
        merged_df['CIHR contribution'].quantile(0.6), 
        merged_df['CIHR contribution'].quantile(0.8), 
        merged_df['CIHR contribution'].max()]
merged_df['funding_category'] = pd.cut(merged_df['CIHR contribution'], bins=bins, labels=labels, include_lowest=True)
category_order = {"Very Small": 1, "Small": 2, "Moderate": 3, "Large": 4, "Very Large": 5}
merged_df['category_numeric'] = merged_df['funding_category'].map(category_order)
y = merged_df['category_numeric']
### Splitting test train data
from sklearn.preprocessing import StandardScaler
x_train, x_test, y_train, y_test = train_test_split(merged_df['Combined_bigram'], y, test_size=0.2, random_state=42)


tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(x_train)
X_test_tfidf = tfidf.transform(x_test)
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()


[nltk_data] Downloading package wordnet to c:\Users\fuzzi\AppData\Loca
[nltk_data]     l\Programs\Python\Python311\lib\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


results = {}
gb_model = GradientBoostingClassifier(random_state=42)
knn_model = KNeighborsClassifier()
lda_model = LinearDiscriminantAnalysis()
def train_and_evaluate(model, param_grid=None):
    if param_grid:  # Perform grid search if param_grid is provided
        grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        if model in (gb_model, knn_model, lda_model):
            grid.fit(X_train_dense, y_train)
            best_model = grid.best_estimator_
            print(f"Best parameters: {grid.best_params_}")
        else:
            grid.fit(X_train_tfidf, y_train)
            best_model = grid.best_estimator_
            print(f"Best parameters: {grid.best_params_}")

    else:
        if model in (gb_model, knn_model, lda_model):
            best_model = model.fit(X_train_dense, y_train)
        else:
            best_model = model.fit(X_train_tfidf, y_train)
    # Cross-validation score
    if model in (gb_model, knn_model, lda_model):
        cv_score = cross_val_score(best_model, X_train_dense, y_train, cv=5, scoring='accuracy').mean()
        print(f"Cross-validation accuracy: {cv_score:.4f}")
        y_pred = best_model.predict(X_test_dense)
    else:
        cv_score = cross_val_score(best_model, X_train_tfidf, y_train, cv=5, scoring='accuracy').mean()
        print(f"Cross-validation accuracy: {cv_score:.4f}")    
        y_pred = best_model.predict(X_test_tfidf)
    # Evaluate on test data
    
    
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
        
    return best_model, cv_score

# 1. Multinomial Logistic Regression
print("\n1. Multinomial Logistic Regression")
logistic_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=5000)
logistic_param_grid = {'C': [0.1, 1, 10]}  # Regularization parameter
results['Logistic Regression'] = train_and_evaluate(logistic_model, logistic_param_grid)

# 2. Decision Tree
print("\n2. Decision Tree")
tree_model = DecisionTreeClassifier(random_state=42)
tree_param_grid = {'max_depth': [3, 5, 10], 'min_samples_split': [2, 5, 10]}
results['Decision Tree'] = train_and_evaluate(tree_model, tree_param_grid)

# 3. Random Forest
print("\n3. Random Forest")
rf_model = RandomForestClassifier(random_state=42)
rf_param_grid = {'n_estimators': [100, 200], 'max_depth': [5, 10, 20]}
results['Random Forest'] = train_and_evaluate(rf_model, rf_param_grid)
'''
# 4. Gradient Boosting
print("\n4. Gradient Boosting")
gb_model = GradientBoostingClassifier(random_state=42)
gb_param_grid = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]}
results['Gradient Boosting'] = train_and_evaluate(gb_model, gb_param_grid)
'''
# 5. Naive Bayes
print("\n5. Naive Bayes")
nb_model = MultinomialNB()
nb_param_grid = {'alpha': [0.1, 1, 10]}  # Smoothing parameter
results['Naive Bayes'] = train_and_evaluate(nb_model, nb_param_grid)

# 6. k-Nearest Neighbors (k-NN)
print("\n6. k-Nearest Neighbors (k-NN)")
knn_model = KNeighborsClassifier()
knn_param_grid = {'n_neighbors': [3, 5, 10], 'weights': ['uniform', 'distance']}
results['k-NN'] = train_and_evaluate(knn_model, knn_param_grid)

# 7. Linear Discriminant Analysis (LDA)
print("\n7. Linear Discriminant Analysis")
lda_model = LinearDiscriminantAnalysis()
lda_param_grid = None  # No hyperparameters to tune for LDA
results['LDA'] = train_and_evaluate(lda_model, lda_param_grid)

# Summary of cross-validation scores
print("\nSummary of Cross-Validation Scores:")
for model_name, (model, score) in results.items():
    print(f"{model_name}: {score:.4f}")


1. Multinomial Logistic Regression
Best parameters: {'C': 1}
Cross-validation accuracy: 0.4092
Classification Report:
              precision    recall  f1-score   support

           1       0.68      0.55      0.61       253
           2       0.43      0.51      0.46       253
           3       0.35      0.44      0.39       286
           4       0.34      0.18      0.23       211
           5       0.36      0.40      0.38       235

    accuracy                           0.42      1238
   macro avg       0.43      0.42      0.42      1238
weighted avg       0.43      0.42      0.42      1238

Confusion Matrix:
[[140  54  38   6  15]
 [ 24 129  54  12  34]
 [ 19  44 125  30  68]
 [ 10  35  76  37  53]
 [ 12  41  64  23  95]]

2. Decision Tree
Best parameters: {'max_depth': 10, 'min_samples_split': 2}
Cross-validation accuracy: 0.3306
Classification Report:
              precision    recall  f1-score   support

           1       0.74      0.26      0.39       253
           2   