In [89]:
import pandas as pd


df = pd.read_csv('../data/flashcards.csv')
df


Unnamed: 0,uid,firstName,lastName,course,year,interests,image
0,tMDtxyG4szWfrYxHp2mbB07ohnS2,Name,Surname,New User,Year,Using Butterfly,https://firebasestorage.googleapis.com/v0/b/bu...
1,xr2Cm0afi3VDrTXOtM9V9actzWn1,Name,Surname,New User,Year,Using Butterfly,https://firebasestorage.googleapis.com/v0/b/bu...
2,bG27GB3oQRe5ldjYf6IhkTTgZw62,James,Surname,New User,1st Year,"Eating out, Gym, Drumming and Gaming and Talk...",https://firebasestorage.googleapis.com/v0/b/bu...
3,Keq713gn6jgspUSZ8DsG56uu3k03,faridah,babirye,economics,2nd Year,gym,https://firebasestorage.googleapis.com/v0/b/bu...
4,o6yE2V0s9XYkTiPKlLyWRg4domG2,Shuaiyan,Ouyang,Mechanical Engineering,1st Year,Games,https://firebasestorage.googleapis.com/v0/b/bu...
...,...,...,...,...,...,...,...
2686,IAUtukYMmIZHLLe3ajZs9u9Ail62,Jackie,Cecchetto,New User,Postgrad,"canadian girl thats into beauty, fashion & cars",https://firebasestorage.googleapis.com/v0/b/bu...
2687,k3YhgoTswCUbdoVFb5MHCMWJauF3,Kashaf,Saeed,Master of Public Health,Postgrad,"Bibliophile, Love to visit and explore places ...",https://firebasestorage.googleapis.com/v0/b/bu...
2688,U1vBPgzxqtYsIwrCSc7R1vesmb23,rizka,amelia,supply chain & logistics management,Postgrad,"traveling, movies, watching football, f1 & bad...",https://firebasestorage.googleapis.com/v0/b/bu...
2689,Wj9J72fnPnaB0BuaThMhqCdtVYA2,Aaftab,Randhawa,msc. management,Postgrad,aaaaaaghhhh,https://firebasestorage.googleapis.com/v0/b/bu...


In [104]:

# Feature Engineering

# 1. Drop uids
processed_df = df.drop('uid', axis=1)

# 2. Drop noisy entries (Data Sanitisation)
processed_df = processed_df.loc[(processed_df.firstName != 'Name') & (processed_df.lastName != 'Surname') & (processed_df.course != 'New User') & (processed_df.year != 'Year')]

# 3. Create one-hot encoding for year
years = pd.DataFrame({'year': ['1st Year', '2nd Year', '3rd Year', '4th Year', 'Postgrad']})
one_hot_encoding = pd.get_dummies(years, columns=['year']).astype(int)
one_hot_numpy = one_hot_encoding.values

    # Create a mapping from year text to its corresponding one-hot encoded numpy array
year_to_numpy_map = {year: one_hot_numpy[i] for i, year in enumerate(years.year)}

    # Replace each entry in the 'year' column
processed_df['year'] = processed_df['year'].apply(lambda year: year_to_numpy_map[year])

# 4. Concatenate first and last name
processed_df.insert(0, 'name', processed_df.firstName + ' ' + processed_df.lastName)
processed_df.drop(columns=['firstName', 'lastName'], axis=1, inplace=True)
processed_df


Unnamed: 0,name,course,year,interests,image
3,faridah babirye,economics,"[0, 1, 0, 0, 0]",gym,https://firebasestorage.googleapis.com/v0/b/bu...
4,Shuaiyan Ouyang,Mechanical Engineering,"[1, 0, 0, 0, 0]",Games,https://firebasestorage.googleapis.com/v0/b/bu...
5,Leah Preston,Psychology in education,"[1, 0, 0, 0, 0]","baking, Disney, shopping, Nintendo, snorkeling...",https://firebasestorage.googleapis.com/v0/b/bu...
6,Tefo Boyze Kepaletswe,MORSE,"[1, 0, 0, 0, 0]","music, art",https://firebasestorage.googleapis.com/v0/b/bu...
7,adz maroc,Cybersecurity,"[1, 0, 0, 0, 0]",ðŸ‡²ðŸ‡¦,https://firebasestorage.googleapis.com/v0/b/bu...
...,...,...,...,...,...
2685,riddhi parmar,public health,"[0, 0, 0, 0, 1]","drawing , photography",https://firebasestorage.googleapis.com/v0/b/bu...
2687,Kashaf Saeed,Master of Public Health,"[0, 0, 0, 0, 1]","Bibliophile, Love to visit and explore places ...",https://firebasestorage.googleapis.com/v0/b/bu...
2688,rizka amelia,supply chain & logistics management,"[0, 0, 0, 0, 1]","traveling, movies, watching football, f1 & bad...",https://firebasestorage.googleapis.com/v0/b/bu...
2689,Aaftab Randhawa,msc. management,"[0, 0, 0, 0, 1]",aaaaaaghhhh,https://firebasestorage.googleapis.com/v0/b/bu...


In [108]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import gensim.downloader as api
import re

# Preprocessing steps
# Modify the preprocessing function to handle non-string (e.g., NaN) values
def preprocess_text_v2(text):
    # Check if the text is a string
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Remove punctuation and numbers
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
    else:
        # If not a string, return an empty string
        text = ''
    return text

# Apply the modified preprocessing function to the 'interests' field
processed_df['name'] = processed_df['name'].apply(preprocess_text_v2)
processed_df['course'] = processed_df['course'].apply(preprocess_text_v2)
processed_df['interests'] = processed_df['interests'].apply(preprocess_text_v2)


# # 1. Text Features: Use TF-IDF for 'course' column
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(processed_df['course'])
processed_df['course'] = tfidf_matrix.toarray().tolist()




processed_df

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [111]:
# Load Google's pretrained Word2Vec model
model = api.load("word2vec-google-news-300")  # This can take some time and requires a lot of RAM

# Function to vectorize a name
def vectorize_name(name, model):
    vectors = [model[word] for word in name.split() if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Apply the function to your DataFrame
df['name_vector'] = df['name'].apply(lambda x: vectorize_name(x, model))





In [None]:

# from sklearn.pipeline import Pipeline
# from sklearn.cluster import KMeans
# import numpy as np
# # Pipeline for preprocessing and clustering
# k = 5  # Number of clusters, can be tuned
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('scaler', StandardScaler(with_mean=False)),  # StandardScaler is used because TF-IDF produces sparse matrix
#     ('clusterer', KMeans(n_clusters=k, random_state=42))
# ])

# # Fitting the model
# pipeline.fit(processed_df)

# # Adding the cluster labels to the original dataframe for interpretation
# processed_df['cluster'] = pipeline.predict(processed_df)
# processed_df.head()