# Train Test Split

In [4]:
# 📦 Import Required Libraries
import os
import random
import pandas as pd

# 📍 Locations of the raw edge lists
facebook_edges_path = 'data/facebook/edges.txt'
pokec_relationships_path = 'data/pokec/relationships.txt'

# Ensure both dataset directories are present (no-op when they already exist)
for dataset_dir in ('data/facebook/', 'data/pokec/'):
    os.makedirs(dataset_dir, exist_ok=True)

# 📚 Split Function for Small Datasets (In-Memory)
def split_edges(input_path, output_folder, train_filename, test_filename, test_size=0.2, seed=42):
    """
    Splits smaller edge files into train/test sets using in-memory shuffling.

    Every non-blank line of ``input_path`` must contain exactly two
    whitespace-separated integer node ids. Blank lines are skipped, so a
    trailing empty line no longer aborts the run halfway through writing.

    Args:
        input_path: Path of the edge list to read (UTF-8 text).
        output_folder: Directory receiving both output files; created if missing.
        train_filename: File name (inside ``output_folder``) for the train edges.
        test_filename: File name (inside ``output_folder``) for the test edges.
        test_size: Fraction of edges placed in the test file, within [0, 1].
        seed: Seed for the shuffle; the same seed yields the same split.

    Raises:
        ValueError: If ``test_size`` is outside [0, 1], or a non-blank line
            does not consist of exactly two integers.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    # A private generator gives the same shuffle as seeding the module-level
    # RNG, without resetting random state for the rest of the program.
    rng = random.Random(seed)

    # Step 1: Load edges into memory, validating each line before any output is written
    edges = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                continue  # blank line, not an edge
            if len(fields) != 2:
                raise ValueError(
                    f"{input_path}, line {line_no}: expected 2 node ids, found {len(fields)}"
                )
            edges.append((int(fields[0]), int(fields[1])))

    print(f"📄 Total edges loaded from {input_path}: {len(edges)}")

    # Step 2: Shuffle and split
    rng.shuffle(edges)
    split_idx = int((1 - test_size) * len(edges))
    train_edges = edges[:split_idx]
    test_edges = edges[split_idx:]

    print(f"🧪 Train edges: {len(train_edges)}, Test edges: {len(test_edges)}")

    # Step 3: Save to files
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for filename, subset in ((train_filename, train_edges), (test_filename, test_edges)):
        with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as f:
            f.writelines(f"{u} {v}\n" for u, v in subset)

    print(f"✅ Train/Test files saved to: {output_folder}\n")

# 📚 Split Function for Large Datasets (Streaming)
def split_edges_streaming(input_path, output_folder, train_filename, test_filename, test_size=0.2, seed=42):
    """
    Memory-efficient splitting for large edge files.

    The input is read twice: once to count edges, once to route each edge
    line to the train or test file. Only the set of sampled test positions
    is held in memory. Blank lines are not counted as edges and are not
    copied to either output.

    Args:
        input_path: Path of the edge list to read (UTF-8 text, one edge per line).
        output_folder: Directory receiving both output files; created if missing.
        train_filename: File name (inside ``output_folder``) for the train edges.
        test_filename: File name (inside ``output_folder``) for the test edges.
        test_size: Fraction of edges placed in the test file, within [0, 1].
        seed: Seed for the sampling; the same seed yields the same split.

    Raises:
        ValueError: If ``test_size`` is outside [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    # A private generator draws the same sample as seeding the module-level
    # RNG, without resetting random state for the rest of the program.
    rng = random.Random(seed)

    # Step 1: Count edge lines (blank lines excluded)
    with open(input_path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for line in f if line.strip())

    print(f"📄 Total edges in {input_path}: {total_lines}")

    # Step 2: Randomly sample test indices
    test_count = int(total_lines * test_size)
    test_indices = set(rng.sample(range(total_lines), test_count))

    # Step 3: Write to files on-the-fly
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    train_path = os.path.join(output_folder, train_filename)
    test_path = os.path.join(output_folder, test_filename)

    with open(input_path, 'r', encoding='utf-8') as fin, \
         open(train_path, 'w', encoding='utf-8') as ftrain, \
         open(test_path, 'w', encoding='utf-8') as ftest:

        # Indices are positions among edge lines only, matching the count above
        edge_lines = (line for line in fin if line.strip())
        for idx, line in enumerate(edge_lines):
            if not line.endswith('\n'):
                line += '\n'  # final input line may lack a terminator
            (ftest if idx in test_indices else ftrain).write(line)

    print(f"🧪 Train edges: {total_lines - test_count}, Test edges: {test_count}")
    print(f"✅ Train/Test files saved to: {output_folder}\n")

# 📦 Facebook is small enough to be shuffled entirely in memory
facebook_split = {
    'input_path': facebook_edges_path,
    'output_folder': 'data/facebook/',
    'train_filename': 'train_edges.txt',
    'test_filename': 'test_edges.txt',
    'test_size': 0.2,
    'seed': 42,
}
split_edges(**facebook_split)

# 📦 Pokec is large, so it goes through the streaming splitter
pokec_split = {
    'input_path': pokec_relationships_path,
    'output_folder': 'data/pokec/',
    'train_filename': 'train_relationships.txt',
    'test_filename': 'test_relationships.txt',
    'test_size': 0.2,
    'seed': 42,
}
split_edges_streaming(**pokec_split)

print("🎯 Done! Now both Facebook and Pokec datasets have Train/Test splits ready!")


📄 Total edges loaded from data/facebook/edges.txt: 170174
🧪 Train edges: 136139, Test edges: 34035
✅ Train/Test files saved to: data/facebook/

📄 Total edges in data/pokec/relationships.txt: 30622564
🧪 Train edges: 24498052, Test edges: 6124512
✅ Train/Test files saved to: data/pokec/

🎯 Done! Now both Facebook and Pokec datasets have Train/Test splits ready!


# Dummy Models

In [1]:
# 📦 Import Required Libraries
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from surprise import Dataset, Reader, SVD

# 📍 Where the placeholder models are written (one folder per dataset)
facebook_saved_models_path = 'models/saved_models/facebook/'
pokec_saved_models_path = 'models/saved_models/pokec/'

# 📂 Make sure both target folders exist
for _models_dir in (facebook_saved_models_path, pokec_saved_models_path):
    os.makedirs(_models_dir, exist_ok=True)


def _save_for_both_datasets(obj, filename):
    """Pickle `obj` as `filename` in the Facebook folder, then in the Pokec folder."""
    for target_dir in (facebook_saved_models_path, pokec_saved_models_path):
        joblib.dump(obj, target_dir + filename)


# 📚 Random feature matrix standing in for real data: 100 users, 10 features
X_dummy = np.random.rand(100, 10)

# -----------------------------------
# 1. Placeholder KNN model (5 neighbours, cosine metric)
knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
knn_model.fit(X_dummy)
_save_for_both_datasets(knn_model, 'knn_model.pkl')
print("✅ Dummy KNN model saved for Facebook and Pokec")

# -----------------------------------
# 2. Placeholder KMeans model (5 clusters, fixed random_state)
kmeans_model = KMeans(n_clusters=5, random_state=42)
kmeans_model.fit(X_dummy)
_save_for_both_datasets(kmeans_model, 'kmeans_model.pkl')
print("✅ Dummy KMeans model saved for Facebook and Pokec")

# -----------------------------------
# 3. Placeholder SVD model (matrix factorization)

# Surprise trains on (user, item, rating) rows, so 500 random user-user
# pairs are generated, each with an interaction value of 1.
_dummy_users = np.random.randint(0, 100, 500)
_dummy_friends = np.random.randint(0, 100, 500)
dummy_interactions = pd.DataFrame({
    'user_id': _dummy_users,
    'friend_id': _dummy_friends,
    'interaction': np.ones(500),
})

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(dummy_interactions[['user_id', 'friend_id', 'interaction']], reader)
trainset = data.build_full_trainset()

svd_model = SVD()
svd_model.fit(trainset)
_save_for_both_datasets(svd_model, 'svd_model.pkl')
print("✅ Dummy SVD model saved for Facebook and Pokec")

# -----------------------------------
# 4. Placeholder for cosine similarity (optional)
# Cosine similarity is handled later inside the backend logic, so no fitted
# model is stored; a small dict is written so the file exists.
cosine_similarity_dummy = {"info": "dummy placeholder"}
_save_for_both_datasets(cosine_similarity_dummy, 'cosine_similarity_model.pkl')
print("✅ Dummy Cosine Similarity model saved for Facebook and Pokec")

print("\n🎯 All Dummy Models Created Successfully!")


✅ Dummy KNN model saved for Facebook and Pokec
✅ Dummy KMeans model saved for Facebook and Pokec
✅ Dummy SVD model saved for Facebook and Pokec
✅ Dummy Cosine Similarity model saved for Facebook and Pokec

🎯 All Dummy Models Created Successfully!


# Visualisation

In [15]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import re
from html import unescape

In [4]:
# Load Profiles
# Read as tab-separated text with no header row, so pandas labels the columns
# with integers (0, 1, 2, ...).
profiles = pd.read_csv('data/pokec/profiles.txt', sep='\t', header=None)

# Load Relationships (edges)
# Also tab-separated with no header row; the two columns are named
# user_id and friend_id on load.
relationships = pd.read_csv('data/pokec/relationships.txt', sep='\t', header=None, names=['user_id', 'friend_id'])

# Show samples
# NOTE(review): `display` is not imported anywhere in this notebook —
# presumably the IPython/Jupyter built-in; confirm if run as a plain script.
print("Profiles Sample:")
display(profiles.head())

print("\nRelationships Sample:")
display(relationships.head())

Profiles Sample:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,1,1,14,1.0,"zilinsky kraj, zilina",2012-05-25 11:20:00.0,2005-04-03 00:00:00.0,26.0,"185 cm, 90 kg",it,...,,,,,,,,,,
1,2,1,62,0.0,"zilinsky kraj, kysucke nove mesto",2012-05-25 23:08:00.0,2007-11-30 00:00:00.0,0.0,"166 cm, 58 kg",,...,,,,,,,,,,
2,16,1,64,1.0,"zilinsky kraj, kysucke nove mesto",2012-05-25 23:19:40.0,2008-05-18 00:00:00.0,23.0,"173 cm, 70 kg",najvatcsej firme na svete urad prace,...,,,,,,,,,,
3,3,0,38,1.0,"bratislavsky kraj, bratislava - karlova ves",2012-05-10 18:05:00.0,2010-05-23 00:00:00.0,29.0,,"reklamy a medii, sluzieb a obchodu",...,,,,,,,,,,
4,4,1,12,0.0,"banskobystricky kraj, brezno",2011-12-29 12:25:00.0,2011-12-29 00:00:00.0,26.0,,,...,,,,,,,,,,



Relationships Sample:


Unnamed: 0,user_id,friend_id
0,1,13
1,1,11
2,1,6
3,1,3
4,1,4


In [5]:
# Replace the integer column labels of `profiles` with descriptive names.
# The list holds exactly 60 names, assigned by position, so order matters.
# NOTE(review): the name-to-column mapping comes from an earlier manual field
# analysis that is not shown here — verify against the Pokec dataset description.
# The last name, "extra_4", is a placeholder that is dropped in the next cell.
profiles.columns = [
    "user_id", "profile_visibility", "completion_percent", "gender", "region", 
    "last_login", "registration_date", "age", "height_weight", "education_occupation",
    "languages", "hobbies", "evening_activity", "pets", "body_type", "health",
    "eye_color", "hair_color", "hair_length", "education_level", "ideal_evening_activity","favorite_colors",
    "smoking", "drinking", "zodiac_sign", "looking_for", "motto", "partner_qualities",
    "relationship_status", "children_now", "children_future", "favorite_movie_genres",
    "free_time_activities", "music_genres", "ideal_date", "pastimes", "favorite_cuisine",
    "joined_clubs", "concert_attendance", "sports_activities", "favorite_sports_to_watch", "current_occupation_or_education", 
    "favorite_book_genres", "community_clubs", "music_clubs_joined", "automobile_clubs_joined", "national_pride_clubs",
    "love_and_friendship_clubs", "art_and_life_clubs", "community_and_hobby_clubs", "social_fun_clubs", 
    "technology_and_internet_interests", "school_and_education_experiences", "sports_and_athletics_interests", 
    "favorite_movies_and_tvshows", "favorite_places_and_travel",
    "life_mottos_and_fun_clubs", "brands_and_online_platforms", "life_quotes_and_personality_clubs", "extra_4"
]

In [6]:
# Discard the 'extra_4' placeholder column
profiles = profiles.drop('extra_4', axis='columns')

In [7]:
# Show the first five rows of `profiles` as the cell's output
profiles.head(5)

Unnamed: 0,user_id,profile_visibility,completion_percent,gender,region,last_login,registration_date,age,height_weight,education_occupation,...,community_and_hobby_clubs,social_fun_clubs,technology_and_internet_interests,school_and_education_experiences,sports_and_athletics_interests,favorite_movies_and_tvshows,favorite_places_and_travel,life_mottos_and_fun_clubs,brands_and_online_platforms,life_quotes_and_personality_clubs
0,1,1,14,1.0,"zilinsky kraj, zilina",2012-05-25 11:20:00.0,2005-04-03 00:00:00.0,26.0,"185 cm, 90 kg",it,...,,,,,,,,,,
1,2,1,62,0.0,"zilinsky kraj, kysucke nove mesto",2012-05-25 23:08:00.0,2007-11-30 00:00:00.0,0.0,"166 cm, 58 kg",,...,,,,,,,,,,
2,16,1,64,1.0,"zilinsky kraj, kysucke nove mesto",2012-05-25 23:19:40.0,2008-05-18 00:00:00.0,23.0,"173 cm, 70 kg",najvatcsej firme na svete urad prace,...,,,,,,,,,,
3,3,0,38,1.0,"bratislavsky kraj, bratislava - karlova ves",2012-05-10 18:05:00.0,2010-05-23 00:00:00.0,29.0,,"reklamy a medii, sluzieb a obchodu",...,,,,,,,,,,
4,4,1,12,0.0,"banskobystricky kraj, brezno",2011-12-29 12:25:00.0,2011-12-29 00:00:00.0,26.0,,,...,,,,,,,,,,


In [8]:
# Names of the `profiles` columns that are passed through clean_html below.
# NOTE(review): presumably these are the free-text club/interest fields that
# still contain HTML markup — confirm against the raw data.
columns_to_clean = [
    "joined_clubs", "community_clubs", "music_clubs_joined", "automobile_clubs_joined",
    "national_pride_clubs", "love_and_friendship_clubs", "art_and_life_clubs", "community_and_hobby_clubs",
    "social_fun_clubs", "technology_and_internet_interests", "school_and_education_experiences",
    "sports_and_athletics_interests", "favorite_movies_and_tvshows", "favorite_places_and_travel",
    "life_mottos_and_fun_clubs", "brands_and_online_platforms", "life_quotes_and_personality_clubs"
]

# --- Step 3: Function to clean HTML tags + decode entities
# Matches one tag from '<' to the next '>'. Using [^>] instead of '.' lets a
# tag span a line break, which '.' would not cross without re.DOTALL.
_HTML_TAG_RE = re.compile(r'<[^>]*>')


def clean_html(text):
    """
    Strip HTML tags from `text`, decode HTML entities and flatten line breaks.

    Args:
        text: Cell value to clean. Missing values (None/NaN) are returned
            unchanged; other non-string values are converted with ``str()``.

    Returns:
        The cleaned string with leading/trailing whitespace removed, or the
        original missing value.
    """
    if pd.isna(text):
        return text
    if not isinstance(text, str):
        # e.g. a value pandas parsed as a number; re.sub would raise TypeError
        text = str(text)
    # 1. Remove all HTML tags (including tags broken across lines)
    text = _HTML_TAG_RE.sub('', text)
    # 2. Decode HTML entities (like &shy;, &bull;, etc.)
    text = unescape(text)
    # 3. Turn line breaks into spaces and trim the ends
    return text.replace('\n', ' ').replace('\r', ' ').strip()

# --- Step 4: Run every value of each listed column through clean_html
for column_name in columns_to_clean:
    cleaned_column = profiles[column_name].map(clean_html)
    profiles[column_name] = cleaned_column

print("✅ All HTML tags and entities removed!")


# --- Step 5: OPTIONAL: Show five random rows of the cleaned columns for a manual check
print("\n🎯 Sample after cleaning:")
cleaned_sample = profiles[columns_to_clean].sample(5)
display(cleaned_sample)

✅ All HTML tags and entities removed!

🎯 Sample after cleaning:


Unnamed: 0,joined_clubs,community_clubs,music_clubs_joined,automobile_clubs_joined,national_pride_clubs,love_and_friendship_clubs,art_and_life_clubs,community_and_hobby_clubs,social_fun_clubs,technology_and_internet_interests,school_and_education_experiences,sports_and_athletics_interests,favorite_movies_and_tvshows,favorite_places_and_travel,life_mottos_and_fun_clubs,brands_and_online_platforms,life_quotes_and_personality_clubs
408837,,,,,,,,,,,,,,,,,
248628,,,hip hop,,,"♥ ♥ ♥ zijem pre ludi , ktory ziju pre mna :* ...",,,,,,,,,,,
942288,,,,,,,,,,,,,,,,,
1055942,,,,,,,,,,,,,,,,,
1080152,,,,,,,,,,,,,,,,,


In [9]:
# Show the first rows of `profiles` as the cell's output
profiles.head()

Unnamed: 0,user_id,profile_visibility,completion_percent,gender,region,last_login,registration_date,age,height_weight,education_occupation,...,community_and_hobby_clubs,social_fun_clubs,technology_and_internet_interests,school_and_education_experiences,sports_and_athletics_interests,favorite_movies_and_tvshows,favorite_places_and_travel,life_mottos_and_fun_clubs,brands_and_online_platforms,life_quotes_and_personality_clubs
0,1,1,14,1.0,"zilinsky kraj, zilina",2012-05-25 11:20:00.0,2005-04-03 00:00:00.0,26.0,"185 cm, 90 kg",it,...,,,,,,,,,,
1,2,1,62,0.0,"zilinsky kraj, kysucke nove mesto",2012-05-25 23:08:00.0,2007-11-30 00:00:00.0,0.0,"166 cm, 58 kg",,...,,,,,,,,,,
2,16,1,64,1.0,"zilinsky kraj, kysucke nove mesto",2012-05-25 23:19:40.0,2008-05-18 00:00:00.0,23.0,"173 cm, 70 kg",najvatcsej firme na svete urad prace,...,,,,,,,,,,
3,3,0,38,1.0,"bratislavsky kraj, bratislava - karlova ves",2012-05-10 18:05:00.0,2010-05-23 00:00:00.0,29.0,,"reklamy a medii, sluzieb a obchodu",...,,,,,,,,,,
4,4,1,12,0.0,"banskobystricky kraj, brezno",2011-12-29 12:25:00.0,2011-12-29 00:00:00.0,26.0,,,...,,,,,,,,,,


In [10]:
# Step 1: Parse both timestamp columns; values that cannot be parsed become NaT
login_stamp = pd.to_datetime(profiles['last_login'], errors='coerce')
registration_stamp = pd.to_datetime(profiles['registration_date'], errors='coerce')

# Step 2: Store the time-of-day parts
profiles['last_login_time'] = login_stamp.dt.time
profiles['registration_time'] = registration_stamp.dt.time

# Step 3: Store the date parts (registration_date is overwritten with its date only)
profiles['last_login_date'] = login_stamp.dt.date
profiles['registration_date'] = registration_stamp.dt.date

# Step 4: The combined last_login column is no longer needed
profiles = profiles.drop(columns=['last_login'])

# Step 5: Final column order, with the new date/time columns placed where
# the original timestamp columns used to be
desired_order = [
    "user_id",
    "profile_visibility",
    "completion_percent",
    "gender",
    "region",
    "last_login_date",
    "last_login_time",
    "registration_date",
    "registration_time",
    "age",
    "height_weight",
    "education_occupation",
    "languages",
    "hobbies",
    "evening_activity",
    "pets",
    "body_type",
    "health",
    "eye_color",
    "hair_color",
    "hair_length",
    "education_level",
    "ideal_evening_activity",
    "favorite_colors",
    "smoking",
    "drinking",
    "zodiac_sign",
    "looking_for",
    "motto",
    "partner_qualities",
    "relationship_status",
    "children_now",
    "children_future",
    "favorite_movie_genres",
    "free_time_activities",
    "music_genres",
    "ideal_date",
    "pastimes",
    "favorite_cuisine",
    "joined_clubs",
    "concert_attendance",
    "sports_activities",
    "favorite_sports_to_watch",
    "current_occupation_or_education",
    "favorite_book_genres",
    "community_clubs",
    "music_clubs_joined",
    "automobile_clubs_joined",
    "national_pride_clubs",
    "love_and_friendship_clubs",
    "art_and_life_clubs",
    "community_and_hobby_clubs",
    "social_fun_clubs",
    "technology_and_internet_interests",
    "school_and_education_experiences",
    "sports_and_athletics_interests",
    "favorite_movies_and_tvshows",
    "favorite_places_and_travel",
    "life_mottos_and_fun_clubs",
    "brands_and_online_platforms",
    "life_quotes_and_personality_clubs",
]

profiles = profiles.loc[:, desired_order]

# Step 6: Write the reordered frame to CSV without the index column
profiles.to_csv('data/pokec/profiles.csv', index=False)

print("✅ profiles saved correctly with proper column order!")


✅ profiles saved correctly with proper column order!


In [11]:
# Read the CSV written by the previous cell back into a separate frame and
# show its first rows, to inspect what was actually saved.
# NOTE(review): the following cells keep using the in-memory `profiles`,
# not `profiles_xlx`.
profiles_xlx = pd.read_csv('data/pokec/profiles.csv')
profiles_xlx.head()

Unnamed: 0,user_id,profile_visibility,completion_percent,gender,region,last_login_date,last_login_time,registration_date,registration_time,age,...,community_and_hobby_clubs,social_fun_clubs,technology_and_internet_interests,school_and_education_experiences,sports_and_athletics_interests,favorite_movies_and_tvshows,favorite_places_and_travel,life_mottos_and_fun_clubs,brands_and_online_platforms,life_quotes_and_personality_clubs
0,1,1,14,1.0,"zilinsky kraj, zilina",2012-05-25,11:20:00,2005-04-03,00:00:00,26.0,...,,,,,,,,,,
1,2,1,62,0.0,"zilinsky kraj, kysucke nove mesto",2012-05-25,23:08:00,2007-11-30,00:00:00,0.0,...,,,,,,,,,,
2,16,1,64,1.0,"zilinsky kraj, kysucke nove mesto",2012-05-25,23:19:40,2008-05-18,00:00:00,23.0,...,,,,,,,,,,
3,3,0,38,1.0,"bratislavsky kraj, bratislava - karlova ves",2012-05-10,18:05:00,2010-05-23,00:00:00,29.0,...,,,,,,,,,,
4,4,1,12,0.0,"banskobystricky kraj, brezno",2011-12-29,12:25:00,2011-12-29,00:00:00,26.0,...,,,,,,,,,,


In [12]:
# Step 1: Rows that have a registration_time at all
has_registration_time = profiles['registration_time'].notna()
non_null_times = profiles.loc[has_registration_time]

# Step 2: Of those, rows whose time differs from midnight (00:00:00)
midnight = pd.to_datetime('00:00:00').time()
differs_from_midnight = non_null_times['registration_time'].ne(midnight)
non_midnight_times = non_null_times.loc[differs_from_midnight]

# Step 3: Report how many such rows exist
print(f"✅ Total non-midnight registration times found ➔ {len(non_midnight_times)} rows")

# Optional: Show up to ten of them
display(non_midnight_times[['user_id', 'registration_time']].head(10))

✅ Total non-midnight registration times found ➔ 0 rows


Unnamed: 0,user_id,registration_time


In [13]:
# Step 4: Remove registration_time; the check above found no value other
# than midnight or null in it
profiles = profiles.drop('registration_time', axis='columns')

print("✅ Dropped registration_time because it contained only 00:00:00 or was null.")

✅ Dropped registration_time because it contained only 00:00:00 or was null.


In [14]:
import pandas as pd
import numpy as np
import re

# --- Step 1: Smart Split Function
# A number (optionally with a decimal part) directly followed by its unit.
_HW_NUMBER = r'(\d+\.?\d*)'
_HEIGHT_RE = re.compile(_HW_NUMBER + r'\s*cm', re.IGNORECASE)
_WEIGHT_RE = re.compile(_HW_NUMBER + r'\s*kg', re.IGNORECASE)
_ANY_NUMBER_RE = re.compile(_HW_NUMBER)


def _measure_from_part(part, unit_re, allow_fallback):
    """Return the number attached to a unit in `part`, or None if there is none."""
    match = unit_re.search(part)
    if match is None and allow_fallback:
        # Unit is mentioned but not right after a number: take the first
        # number in the part instead.
        match = _ANY_NUMBER_RE.search(part)
    return float(match.group(1)) if match else None


def smart_split_height_weight(value):
    """
    Split a free-text height/weight value into height, weight and leftover text.

    The value is split on commas and line breaks. A part mentioning "cm"
    supplies the height and a part mentioning "kg" supplies the weight
    (units are matched case-insensitively). A single part may supply both,
    e.g. "185 cm 90 kg". Parts mentioning neither unit are kept as comments.

    Args:
        value: Raw cell value. Missing values and the literal string
            'Missing value' produce three NaNs; other non-string values are
            converted with ``str()``.

    Returns:
        pd.Series of three items: height (float or NaN), weight (float or
        NaN), and the comment parts joined with ', ' (or NaN when there are none).
    """
    if pd.isna(value) or value == 'Missing value':
        return pd.Series([np.nan, np.nan, np.nan])

    text = value if isinstance(value, str) else str(value)

    height = np.nan
    weight = np.nan
    comments = []

    for part in re.split(r'[,\n]', text):
        part = part.strip()
        if not part:
            continue

        lowered = part.lower()
        has_cm = 'cm' in lowered
        has_kg = 'kg' in lowered

        if not (has_cm or has_kg):
            comments.append(part)
            continue

        # When both units share one part, only a number next to its own unit
        # is trusted; otherwise the same number could be read as both values.
        if has_cm:
            parsed = _measure_from_part(part, _HEIGHT_RE, allow_fallback=not has_kg)
            if parsed is not None:
                height = parsed
        if has_kg:
            parsed = _measure_from_part(part, _WEIGHT_RE, allow_fallback=not has_cm)
            if parsed is not None:
                weight = parsed

    comment_text = ', '.join(comments) if comments else np.nan
    return pd.Series([height, weight, comment_text])

# --- Step 2: Remember the position currently occupied by 'height_weight'
height_weight_idx = profiles.columns.get_loc('height_weight')

# --- Step 3: Split every value into (height, weight, comment)
new_cols = profiles['height_weight'].apply(smart_split_height_weight)
new_cols.columns = ['height', 'weight', 'height_weight_comment']

# --- Step 4: Remove the original combined column
profiles = profiles.drop(columns=['height_weight'])

# --- Step 5: Put the three new columns where the old one used to be,
# keeping them in height, weight, comment order
for offset, new_name in enumerate(new_cols.columns):
    profiles.insert(height_weight_idx + offset, new_name, new_cols[new_name])

print("✅ Successfully split and inserted new columns correctly!")

✅ Successfully split and inserted new columns correctly!


In [16]:
# Write the current `profiles` frame (without the index) to the same path
# used by the earlier save, replacing that file.
profiles.to_csv('data/pokec/profiles.csv', index=False)
print("✅ Saved as profiles.csv")

✅ Saved as profiles.csv


In [None]:
# -----------------------------
# 6. Save Model and Components
# -----------------------------
# Writes the fitted KMeans model, its feature matrix, the user id list and
# the preprocessing objects into one folder.
# NOTE(review): this cell has no execution count and uses `kmeans`, `X`,
# `user_ids`, `scaler`, `encoder` and `vectorizer`, none of which are defined
# in the visible part of this notebook — presumably it was copied from a
# training notebook; confirm before running.
# NOTE(review): the path starts with '../', unlike 'models/saved_models/pokec/'
# used by the dummy-model cell, and 'kmeans_model.pkl' is also the file name
# that cell writes — confirm the intended working directory.
save_path = '../models/saved_models/pokec/'
os.makedirs(save_path, exist_ok=True)

joblib.dump(kmeans, save_path + 'kmeans_model.pkl')
np.save(save_path + 'kmeans_features.npy', X)
joblib.dump(user_ids, save_path + 'user_ids.pkl')
joblib.dump(scaler, save_path + 'scaler.pkl')
joblib.dump(encoder, save_path + 'encoder.pkl')
joblib.dump(vectorizer, save_path + 'tfidf_vectorizer.pkl')

print(f'✅ Model and features saved to {save_path}')
