In [1]:
# build_profiles.py
# Part I, step 1: for each of the first five users (smallest user ids), build a
# binary profile = union of the feature tokens of every hotel the user visited,
# and write one "userid,feat1|feat2|..." line per user.
import csv

HOTELS_FILE = "hotels_processed.csv"
USERS_FILE  = "UserData.csv"
OUT_FILE    = "Group2_Part1_Profile11.csv"

# 1) Load hotel features: itemid -> list of feature tokens.
#    csv.reader replaces a plain split(",") so that a quoted hotel name that
#    contains a comma no longer shifts the features column (index 3).
item_features = {}
with open(HOTELS_FILE, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for parts in reader:
        if len(parts) < 4:
            continue  # blank or truncated row
        iid = int(parts[0])
        raw_features = parts[3].strip()
        item_features[iid] = raw_features.split("|") if raw_features else []

# 2) Load user -> visited items (first two columns: userid, itemid).
#    A dict is used as an insertion-ordered set so de-duplication is O(1)
#    per row instead of scanning a list for every row.
visited_by_user = {}
with open(USERS_FILE, "r", encoding="utf-8") as f:
    f.readline()  # skip the header row
    for line in f:
        row = line.strip().split(",")
        if len(row) < 2:
            continue
        uid = int(row[0])
        iid = int(row[1])
        visited_by_user.setdefault(uid, {})[iid] = None

# userid -> list of distinct itemids, in first-seen order
user2items = {uid: list(items) for uid, items in visited_by_user.items()}

# 3) Pick the first 5 users (ascending user id)
user_ids = sorted(user2items)[:5]

# 4) Build profiles: ordered union of all features of the visited hotels.
#    Visited items that are missing from the hotels file contribute nothing.
user_profiles = {}
for uid in user_ids:
    ordered_feats = {}
    for iid in user2items[uid]:
        for ftr in item_features.get(iid, ()):
            ordered_feats.setdefault(ftr, None)
    user_profiles[uid] = list(ordered_feats)

# 5) Write profiles to CSV
with open(OUT_FILE, "w", encoding="utf-8") as f:
    f.write("userid,features\n")
    for uid in user_ids:
        feats_str = "|".join(user_profiles[uid])
        f.write(f"{uid},{feats_str}\n")

print("Profiles saved to", OUT_FILE)

Profiles saved to Group2_Part1_Profile11.csv


In [2]:
# jaccard_matrix.py
# Part I, step 2 (inputs): load hotel features, the saved user profiles and
# each user's visited hotels so the Jaccard matrix can be built below.
import csv

HOTELS_FILE = "hotels_processed.csv"
PROFILES_FILE = "Group2_Part1_Profile11.csv"
USERS_FILE = "UserData.csv"
OUT_FILE = "Group2_Part1_SimMatrix12.csv"

# 1) Load hotel features: itemid -> list of feature tokens.
#    csv.reader (rather than split(",")) keeps a quoted hotel name that
#    contains a comma from shifting the features column (index 3).
item_features = {}
with open(HOTELS_FILE, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for parts in reader:
        if len(parts) < 4:
            continue  # blank or truncated row
        raw_features = parts[3].strip()
        item_features[int(parts[0])] = raw_features.split("|") if raw_features else []

# Matrix columns: every known item id, ascending
all_items = sorted(item_features)

# 2) Load user profiles: userid -> list of feature tokens
user_profiles = {}
with open(PROFILES_FILE, "r", encoding="utf-8") as f:
    f.readline()  # skip the header row
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line previously crashed the
            # two-value unpacking of split(",", 1).
            continue
        uid_str, _, feats_str = line.partition(",")
        user_profiles[int(uid_str)] = feats_str.split("|") if feats_str else []

# 3) Load visited hotels (first two columns: userid, itemid).
#    A dict acts as an insertion-ordered set for O(1) de-duplication.
visited_by_user = {}
with open(USERS_FILE, "r", encoding="utf-8") as f:
    f.readline()  # skip header
    for line in f:
        row = line.strip().split(",")
        if len(row) < 2:
            continue
        visited_by_user.setdefault(int(row[0]), {})[int(row[1])] = None

# userid -> list of distinct itemids, in first-seen order
user2items = {uid: list(items) for uid, items in visited_by_user.items()}

# Matrix rows: the users found in the profiles file, ascending
user_ids = sorted(user_profiles)
# 4) Jaccard function
def jaccard(listA, listB):
    """Return the Jaccard similarity |A & B| / |A | B| of two collections.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored. Elements must be hashable.

    Returns:
        A float in [0.0, 1.0]; 0.0 when both collections are empty (the
        union is empty, so the ratio is otherwise undefined).
    """
    setA = set(listA)
    setB = set(listB)
    union_size = len(setA | setB)
    if union_size == 0:
        # Only possible when both inputs are empty.
        return 0.0
    return len(setA & setB) / union_size

# 5) Build matrix and write file
# Layout: header "userid,<itemid>,..." then one row per user; the cell of an
# item the user already visited is left blank, every other cell holds the
# Jaccard score between the user's profile and the item's features.
with open(OUT_FILE, "w", encoding="utf-8") as f:
    f.write(",".join(["userid"] + [str(iid) for iid in all_items]) + "\n")

    for uid in user_ids:
        # Hoisted out of the item loop: one set per user gives O(1) membership
        # tests instead of scanning the visited list for every cell.
        visited = set(user2items.get(uid, []))
        profile = user_profiles[uid]
        cells = [str(uid)]
        for iid in all_items:
            if iid in visited:
                cells.append("")  # leave blank for visited
            else:
                cells.append("{:.6f}".format(jaccard(profile, item_features[iid])))
        f.write(",".join(cells) + "\n")

print("Similarity matrix saved to", OUT_FILE)

Similarity matrix saved to Group2_Part1_SimMatrix12.csv


In [3]:
# top5_recommendations.py
# Part I, step 3: read the similarity matrix and, for every user row, emit
# the TOP_N highest-scoring non-visited hotels (ties broken by smaller itemid).
import csv

HOTELS_FILE = "hotels_processed.csv"
SIM_FILE    = "Group2_Part1_SimMatrix12.csv"
OUT_FILE    = "Group2_Part1_Recommendation13.csv"
TOP_N       = 5

# 1) Build a lookup: itemid -> (hotelid, hotelname)
#    csv.reader keeps quoted hotel names that contain commas in one field.
item_lookup = {}
with open(HOTELS_FILE, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # header: itemid,hotelid,hotelname,features
    for parts in reader:
        if len(parts) < 3:
            continue
        try:
            iid = int(parts[0])
        except ValueError:
            continue  # row without a numeric itemid
        try:
            hid = int(parts[1])
        except ValueError:
            hid = 0  # missing / non-numeric hotelid falls back to 0
        item_lookup[iid] = (hid, parts[2])

# 2) Read similarity matrix and compute Top-N per user
recommendations = []  # (userid, itemid, hotelid, hotelname, similarity)

with open(SIM_FILE, "r", encoding="utf-8") as f:
    header = f.readline().strip().split(",")

    # Column ids, kept aligned with the data cells; None marks a column
    # whose header is not an integer and must be ignored.
    item_ids = []
    for col in header[1:]:
        try:
            item_ids.append(int(col))
        except ValueError:
            item_ids.append(None)

    for line in f:
        row = line.strip().split(",")
        try:
            uid = int(row[0])
        except ValueError:
            continue  # blank line or non-numeric userid

        # Collect (score, itemid) for non-visited cells (non-empty).
        # zip() stops at the shorter sequence, so a row with more cells than
        # the header can no longer raise IndexError.
        scored_items = []
        for iid, cell in zip(item_ids, row[1:]):
            if iid is None:
                continue
            cell = cell.strip()
            if cell == "":  # visited -> left blank
                continue
            try:
                score = float(cell)
            except ValueError:
                continue
            scored_items.append((score, iid))

        # Highest similarity first, then smaller itemid. list.sort is a
        # builtin (no import needed) and replaces the hand-written O(n^2)
        # bubble sort with the same ordering.
        scored_items.sort(key=lambda pair: (-pair[0], pair[1]))

        for score, iid in scored_items[:TOP_N]:
            hid, hname = item_lookup.get(iid, (0, ""))
            recommendations.append((uid, iid, hid, hname, score))

# 3) Write output CSV
#    csv.writer quotes hotel names that contain commas or quotes, so such
#    names cannot break the column layout. lineterminator="\n" together with
#    the default text mode keeps line endings the same as plain f.write("\n").
with open(OUT_FILE, "w", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["userid", "itemid", "hotelid", "hotelname", "similarity"])
    for (uid, iid, hid, hname, s) in recommendations:
        # Embedded line breaks are flattened so every record stays on one line.
        hname_clean = hname.replace("\n", " ").replace("\r", " ")
        writer.writerow([uid, iid, hid, hname_clean, "{:.6f}".format(s)])

print("Top-5 recommendations saved to", OUT_FILE)

Top-5 recommendations saved to Group2_Part1_Recommendation13.csv


In [5]:
# build_profiles.py
# NOTE: this cell repeats the first build_profiles cell of the notebook and
# writes to the same OUT_FILE, so running it simply regenerates that file.
# Part I, step 1: for each of the first five users (smallest user ids), build a
# binary profile = union of the feature tokens of every hotel the user visited,
# and write one "userid,feat1|feat2|..." line per user.
import csv

HOTELS_FILE = "hotels_processed.csv"
USERS_FILE  = "UserData.csv"
OUT_FILE    = "Group2_Part1_Profile11.csv"

# 1) Load hotel features: itemid -> list of feature tokens.
#    csv.reader replaces a plain split(",") so that a quoted hotel name that
#    contains a comma no longer shifts the features column (index 3).
item_features = {}
with open(HOTELS_FILE, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for parts in reader:
        if len(parts) < 4:
            continue  # blank or truncated row
        iid = int(parts[0])
        raw_features = parts[3].strip()
        item_features[iid] = raw_features.split("|") if raw_features else []

# 2) Load user -> visited items (first two columns: userid, itemid).
#    A dict is used as an insertion-ordered set so de-duplication is O(1)
#    per row instead of scanning a list for every row.
visited_by_user = {}
with open(USERS_FILE, "r", encoding="utf-8") as f:
    f.readline()  # skip the header row
    for line in f:
        row = line.strip().split(",")
        if len(row) < 2:
            continue
        uid = int(row[0])
        iid = int(row[1])
        visited_by_user.setdefault(uid, {})[iid] = None

# userid -> list of distinct itemids, in first-seen order
user2items = {uid: list(items) for uid, items in visited_by_user.items()}

# 3) Pick the first 5 users (ascending user id)
user_ids = sorted(user2items)[:5]

# 4) Build profiles: ordered union of all features of the visited hotels.
#    Visited items that are missing from the hotels file contribute nothing.
user_profiles = {}
for uid in user_ids:
    ordered_feats = {}
    for iid in user2items[uid]:
        for ftr in item_features.get(iid, ()):
            ordered_feats.setdefault(ftr, None)
    user_profiles[uid] = list(ordered_feats)

# 5) Write profiles to CSV
with open(OUT_FILE, "w", encoding="utf-8") as f:
    f.write("userid,features\n")
    for uid in user_ids:
        feats_str = "|".join(user_profiles[uid])
        f.write(f"{uid},{feats_str}\n")

print("Profiles saved to", OUT_FILE)

Profiles saved to Group2_Part1_Profile11.csv


In [6]:
# build_profiles_part2.py
# Configuration for the Part II profile builder: the two input CSV files and
# the output file that receives the weighted user profiles.
HOTELS_FILE = "hotels_processed.csv"
USERS_FILE  = "UserData.csv"
OUT_FILE    = "Group2_Part2_Profile21.csv"   # the file-name prefix carries the group number (Group2)

# ---------- helpers ----------
def split_features(s):
    """Split a '|'-separated feature string into unique, trimmed tokens.

    Falsy input (empty string, None) gives an empty list. Tokens are stripped
    of surrounding whitespace, blank tokens are dropped, and only the first
    occurrence of each token is kept, in its original position.
    """
    if not s:
        return []
    tokens = []
    known = set()
    for raw in s.split("|"):
        token = raw.strip()
        if not token or token in known:
            continue
        known.add(token)
        tokens.append(token)
    return tokens

# ---------- 1) DF per feature ----------
import csv

feature_df = {}      # feature -> document frequency (number of hotel rows containing it)
item_features = {}   # itemid -> list of unique features
N_items = 0          # number of hotel rows accepted

# csv.reader (rather than split(",")) keeps a quoted hotel name that contains
# a comma from shifting the features column (index 3).
with open(HOTELS_FILE, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header line
    for cols in reader:
        if len(cols) < 4:
            continue
        try:
            iid = int(cols[0])
        except ValueError:
            continue  # row without a numeric itemid
        feats = split_features(cols[3])
        item_features[iid] = feats
        N_items += 1
        # split_features already returns each token once per item, so every
        # token contributes exactly 1 to its document frequency here.
        for ft in feats:
            feature_df[ft] = feature_df.get(ft, 0) + 1

# ---------- 2) IDF ----------
# Simple inverse frequency N_items / df (no log). Every df stored above is
# at least 1, so the division is always defined.
idf = {ft: float(N_items) / float(count) for ft, count in feature_df.items()}

# ---------- 3) Item vectors (binary TF x IDF) ----------
# itemid -> dict(feature -> weight); TF is 1 for present features.
item_vector = {
    iid: {ft: idf.get(ft, 0.0) for ft in feats}
    for iid, feats in item_features.items()
}

# ---------- 4) Load user visits; pick first 5 users ----------
visited_by_user = {}  # userid -> insertion-ordered "set" of itemids
with open(USERS_FILE, "r", encoding="utf-8") as f:
    header = f.readline().strip().split(",")
    # Locate the columns from the header; names are compared after trimming
    # and lower-casing (the item column may appear as ' itemid').
    names = [h.strip().lower() for h in header]
    uid_idx = names.index("userid") if "userid" in names else 0
    iid_idx = next((i for i, h in enumerate(names) if "itemid" in h), 1)
    min_cols = max(uid_idx, iid_idx) + 1

    for line in f:
        row = line.strip().split(",")
        if len(row) < min_cols:
            continue  # too short to hold both ids
        try:
            uid = int(row[uid_idx])
            iid = int(row[iid_idx])
        except ValueError:
            continue  # non-numeric ids are skipped
        visited_by_user.setdefault(uid, {})[iid] = None

# userid -> list of distinct visited iids, in first-seen order
user2items = {uid: list(items) for uid, items in visited_by_user.items()}

# choose the 5 smallest user IDs that appear
all_user_ids = sorted(user2items)
user_ids = all_user_ids[:5]

# ---------- 5) Build centroid profile per user ----------
user_profile = {}  # userid -> dict(feature -> avg weight)

for uid in user_ids:
    count = 0   # visited items that have a non-empty vector
    acc = {}    # feature -> sum of weights across those items
    for iid in user2items.get(uid, []):
        vec = item_vector.get(iid)
        if not vec:
            continue  # unknown item or item without features
        count += 1
        for ft, w in vec.items():
            acc[ft] = acc.get(ft, 0.0) + w

    # average over the counted items (acc is empty when count is 0)
    if count > 0:
        user_profile[uid] = {ft: total / float(count) for ft, total in acc.items()}
    else:
        user_profile[uid] = {}

# ---------- 6) Write Part II profiles ----------
with open(OUT_FILE, "w", encoding="utf-8") as f:
    f.write("userid,features\n")
    for uid in user_ids:
        prof = user_profile.get(uid, {})
        # Descending weight, then token ascending. sorted() is a builtin, so
        # no import is needed and the hand-written bubble sort can go.
        ranked = sorted(prof.items(), key=lambda kv: (-kv[1], kv[0]))
        # format: token:weight
        parts = [ft + ":" + "{:.6f}".format(w) for ft, w in ranked]
        f.write(str(uid) + "," + "|".join(parts) + "\n")

print("Part II profiles saved to", OUT_FILE)

Part II profiles saved to Group2_Part2_Profile21.csv


In [7]:
# build_model_part2.py
# Configuration for the Part II model builder: hotel and user inputs, the
# weighted-profile file to read, and the similarity-matrix file to write.
HOTELS_FILE   = "hotels_processed.csv"
USERS_FILE    = "UserData.csv"
PROFILES_FILE = "Group2_Part2_Profile21.csv"   # same file name the build_profiles_part2.py cell writes to
OUT_FILE      = "Group2_Part2_Model22.csv"     # the file-name prefix carries the group number (Group2)

# -------- helpers --------
def split_features(s):
    """Return the distinct, whitespace-trimmed tokens of a '|'-joined string.

    An empty or None argument produces []. Blank tokens are discarded and
    repeated tokens keep only their first position.
    """
    if not s:
        return []
    trimmed = (piece.strip() for piece in s.split("|"))
    # dict.fromkeys keeps first-seen order while dropping repeats.
    return list(dict.fromkeys(token for token in trimmed if token))

def parse_weighted_features(s):
    """Parse "token:weight|token:weight|..." into a dict(token -> float).

    Each '|'-separated entry is trimmed and split on its LAST ':' so a token
    may itself contain ':'. Entries that are blank, have no ':' or have an
    empty token are skipped. A weight that is not a valid float is stored as
    0.0. When a token occurs more than once, the last entry wins.

    Returns:
        dict mapping token to weight; empty for falsy input.
    """
    feats = {}
    if not s:
        return feats
    for entry in s.split("|"):
        entry = entry.strip()
        if not entry:
            continue
        token, sep, w_str = entry.rpartition(":")
        if not sep or not token:
            continue  # no ':' at all, or nothing before it
        try:
            weight = float(w_str)
        except ValueError:
            # Only a malformed number is tolerated here; a bare `except:`
            # would also swallow unrelated errors such as KeyboardInterrupt.
            weight = 0.0
        feats[token] = weight
    return feats

def dot(a, b):
    """Dot product of two sparse vectors given as dict(token -> weight).

    Only tokens present in both dicts contribute. The shorter dict is the
    one iterated (the first argument on a tie); an empty overlap gives 0.0.
    """
    shorter, longer = (b, a) if len(a) > len(b) else (a, b)
    total = 0.0
    for key, weight in shorter.items():
        if key in longer:
            total += weight * longer[key]
    return total

def norm_sq(a):
    """Squared Euclidean norm of a sparse vector dict(token -> weight)."""
    total = 0.0
    for weight in a.values():
        total += weight * weight
    return total

def cosine(a, b):
    """Cosine similarity of two sparse vectors dict(token -> weight).

    Returns 0.0 when either vector has a non-positive squared norm (for
    example an empty vector), which avoids dividing by zero.
    """
    sq_a = norm_sq(a)
    sq_b = norm_sq(b)
    if not (sq_a > 0.0 and sq_b > 0.0):
        return 0.0
    len_a = sq_a ** 0.5
    len_b = sq_b ** 0.5
    return dot(a, b) / (len_a * len_b)

# -------- 1) Build DF for features & collect item features --------
import csv

feature_df = {}     # feature -> number of hotel rows containing it
item_features = {}  # itemid -> list of unique features
N_items = 0         # number of hotel rows accepted

# csv.reader (rather than split(",")) keeps a quoted hotel name that contains
# a comma from shifting the features column (index 3).
with open(HOTELS_FILE, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header
    for cols in reader:
        if len(cols) < 4:
            continue
        try:
            iid = int(cols[0])
        except ValueError:
            continue  # row without a numeric itemid
        feats = split_features(cols[3])
        item_features[iid] = feats
        N_items += 1
        # split_features already de-duplicates, so each token counts once per row.
        for ft in feats:
            feature_df[ft] = feature_df.get(ft, 0) + 1

# -------- 2) IDF and item vectors (TF is binary) --------
# idf = N_items / df; every stored df is at least 1, so this never divides by zero.
idf = {ft: float(N_items) / float(count) for ft, count in feature_df.items()}

# itemid -> dict(feature -> weight)
item_vector = {
    iid: {ft: idf.get(ft, 0.0) for ft in feats}
    for iid, feats in item_features.items()
}

# -------- 3) Load visited items per user --------
visited_by_user = {}  # userid -> insertion-ordered "set" of itemids
with open(USERS_FILE, "r", encoding="utf-8") as f:
    header = f.readline().strip().split(",")
    # Locate the columns from the header (trimmed, lower-cased names);
    # fall back to column 0 for the user id and column 1 for the item id.
    names = [h.strip().lower() for h in header]
    uid_idx = names.index("userid") if "userid" in names else 0
    iid_idx = next((i for i, h in enumerate(names) if "itemid" in h), 1)
    min_cols = max(uid_idx, iid_idx) + 1

    for line in f:
        row = line.strip().split(",")
        if len(row) < min_cols:
            continue  # too short to hold both ids
        try:
            uid = int(row[uid_idx])
            iid = int(row[iid_idx])
        except ValueError:
            continue  # non-numeric ids are skipped
        visited_by_user.setdefault(uid, {})[iid] = None

# userid -> list of distinct visited itemids, in first-seen order
user2items = {uid: list(items) for uid, items in visited_by_user.items()}

# -------- 4) Load Part II user profiles (weighted) --------
user_profile = {}  # userid -> dict(feature -> weight)
user_ids = []      # row order of the output matrix = order in the profiles file

with open(PROFILES_FILE, "r", encoding="utf-8") as f:
    f.readline()  # skip header
    for line in f:
        # split only on the first comma to keep the feature string intact
        uid_str, sep, feats_str = line.partition(",")
        if not sep:
            continue  # no comma: blank or malformed line
        try:
            uid = int(uid_str.strip())
        except ValueError:
            continue
        user_profile[uid] = parse_weighted_features(feats_str.strip())
        user_ids.append(uid)

# -------- 5) Build cosine similarity matrix & write CSV --------
# Columns are the item ids in ascending order (builtin sorted() replaces the
# hand-written bubble sort).
all_items = sorted(item_vector)

with open(OUT_FILE, "w", encoding="utf-8") as f:
    # header row
    f.write(",".join(["userid"] + [str(iid) for iid in all_items]) + "\n")

    for uid in user_ids:
        # One set per user: O(1) membership per cell instead of a list scan.
        visited = set(user2items.get(uid, []))
        prof = user_profile.get(uid, {})
        cells = [str(uid)]
        for iid in all_items:
            if iid in visited:
                cells.append("")  # blank for visited items
            else:
                sim = cosine(prof, item_vector.get(iid, {}))
                cells.append("{:.6f}".format(sim))
        f.write(",".join(cells) + "\n")

print("Part II model (similarity matrix) saved to", OUT_FILE)

Part II model (similarity matrix) saved to Group2_Part2_Model22.csv


In [8]:
# top10_part2.py
# Part II, step 3: read the cosine similarity matrix and, for every user row,
# emit the TOP_N highest-scoring non-visited hotels (ties: smaller itemid first).
import csv

HOTELS_FILE = "hotels_processed.csv"
MODEL_FILE  = "Group2_Part2_Model22.csv"
OUT_FILE    = "Group2_Part2_Recommendation23.csv"
TOP_N       = 10

# 1) Build lookup: itemid -> (hotelid, hotelname)
#    csv.reader keeps quoted hotel names that contain commas in one field.
item_lookup = {}  # int -> (int, str)
with open(HOTELS_FILE, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # header: itemid,hotelid,hotelname,features
    for parts in reader:
        if len(parts) < 3:
            continue
        try:
            iid = int(parts[0])
        except ValueError:
            continue  # row without a numeric itemid
        try:
            hid = int(parts[1])
        except ValueError:
            hid = 0  # blank / non-numeric hotelid falls back to 0
        hname = parts[2].replace("\n", " ").replace("\r", " ")
        item_lookup[iid] = (hid, hname)

# 2) Read similarity matrix and compute Top-N per user
recs = []  # (userid, itemid, hotelid, hotelname, similarity)

with open(MODEL_FILE, "r", encoding="utf-8") as f:
    # header: userid,<itemid1>,<itemid2>,...
    header = f.readline().strip().split(",")

    # Column ids aligned with the data cells; None marks a non-integer header.
    item_ids = []
    for col in header[1:]:
        try:
            item_ids.append(int(col.strip()))
        except ValueError:
            item_ids.append(None)

    for line in f:
        if not line.strip():
            continue
        row = line.rstrip("\n").split(",")
        try:
            uid = int(row[0].strip())
        except ValueError:
            continue  # first column must be a numeric userid

        # Collect (score, itemid) for non-empty cells. zip() stops at the
        # shorter sequence, so a row with more cells than the header can no
        # longer raise IndexError.
        scored = []
        for iid, cell in zip(item_ids, row[1:]):
            if iid is None:
                continue
            cell = cell.strip()
            if cell == "":
                continue  # blank means visited in our pipeline -> skip
            try:
                score = float(cell)
            except ValueError:
                continue
            scored.append((score, iid))

        # Similarity descending, then itemid ascending. list.sort is a builtin
        # (no import needed) and replaces the O(n^2) bubble sort.
        scored.sort(key=lambda pair: (-pair[0], pair[1]))

        for score, iid in scored[:TOP_N]:
            hid, hname = item_lookup.get(iid, (0, ""))
            recs.append((uid, iid, hid, hname, score))

# 3) Write output CSV
#    csv.writer quotes hotel names that contain commas or quotes, so such
#    names cannot break the column layout. lineterminator="\n" together with
#    the default text mode keeps line endings the same as plain f.write("\n").
with open(OUT_FILE, "w", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["userid", "itemid", "hotelid", "hotelname", "similarity"])
    for (uid, iid, hid, hname, s) in recs:
        # Embedded line breaks are flattened so every record stays on one line.
        name = hname.replace("\n", " ").replace("\r", " ")
        writer.writerow([uid, iid, hid, name, "{:.6f}".format(s)])

print("Top-10 recommendations saved to", OUT_FILE)

Top-10 recommendations saved to Group2_Part2_Recommendation23.csv
