# Trust Profile as seed

In [1]:
import pandas as pd
import networkx as nx
# Load the trusted-profile table; the first CSV column holds the saved index.
trusted_profiles_df = pd.read_csv(
    'd:/falconx/datingappdatascrap/dissimilarity/trusted_profiles_info.csv',
    index_col=0,
)
# Collapse the username column into a set for fast membership tests when seeding.
trusted_usernames = set(trusted_profiles_df['username'])
# Show how many distinct trusted usernames are available as seeds.
len(trusted_usernames)

123

In [5]:
# Read the friendship edge list and build a graph with one node per username.
user_friends_df = pd.read_csv('d:/falconx/datingappdatascrap//normal/user_friends.csv')
G = nx.from_pandas_edgelist(user_friends_df, 'username', 'friend_username')

# Personalization vector: weight 1 for trusted seed profiles, 0 for everyone else.
# NOTE(review): membership is tested with the usernames' original casing, while the
# exported table below is lower-cased — confirm both sources use the same casing.
initial_trust_scores = {username: 1 if username in trusted_usernames else 0 for username in G.nodes}
if not any(initial_trust_scores.values()):
    # Without at least one seed in the graph there is no trust to propagate.
    raise ValueError('None of the trusted usernames appear in the friendship graph.')

# Propagate trust from the seeds with personalized PageRank.
# alpha is the damping factor: 0.9 is used here (networkx's default is 0.85).
trust_scores = nx.pagerank(G, personalization=initial_trust_scores, alpha=0.9)

# Tabulate the scores as (username, trust_score_pcc) rows.
trust_scores_df = pd.DataFrame.from_dict(trust_scores, orient='index').reset_index()
trust_scores_df.columns = ['username', 'trust_score_pcc']
# Lower-case usernames for later joins; .str.lower() passes missing values through
# instead of raising the way a per-row x.lower() call would.
trust_scores_df['username'] = trust_scores_df['username'].str.lower()
trust_scores_df.to_csv('trust_scores.csv', index=False)

In [6]:
# Print a summary of the trust-score table (row count, columns, dtypes, non-null counts).
trust_scores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3386 entries, 0 to 3385
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   username         3386 non-null   object 
 1   trust_score_pcc  3386 non-null   float64
dtypes: float64(1), object(1)
memory usage: 53.0+ KB


# PageRank as Seed

In [7]:
# Load the PageRank table from CSV; the rendered output below shows
# a 'User' column and a 'PageRank Score' column.
pagerank_profiles_df = pd.read_csv('d:/falconx/datingappdatascrap/normal/pagerank.csv')
# Display the loaded table.
pagerank_profiles_df

Unnamed: 0,User,PageRank Score
0,user_78737,0.004997
1,BSharp1013,0.004325
2,Kennethrw,0.003571
3,Louis_M,0.003539
4,jannyN12,0.003300
...,...,...
3381,pionerka,0.000076
3382,johuelj,0.000076
3383,Dl,0.000076
3384,deraj,0.000076


In [8]:
# Number of top-ranked profiles to use as the seed set.
# NOTE(review): 123 equals the number of trusted usernames printed earlier in the
# notebook — presumably chosen so both seed sets have the same size; confirm.
PAGERANK_SEED_SIZE = 123

# Select the seeds by score rather than by row position, so the result does not
# depend on the CSV already being sorted by descending PageRank.
top_pagerank_df = pagerank_profiles_df.nlargest(PAGERANK_SEED_SIZE, 'PageRank Score')
pagerankseed = set(top_pagerank_df['User'])
# Show how many distinct seed usernames were selected.
len(pagerankseed)

123

In [9]:
# Build a fresh graph from the friendship edge list for the PageRank-seeded run.
GP = nx.from_pandas_edgelist(user_friends_df, 'username', 'friend_username')

# Personalization vector: weight 1 for the PageRank seed profiles, 0 for everyone else.
reset_trust_scores = {username: 1 if username in pagerankseed else 0 for username in GP.nodes}
if not any(reset_trust_scores.values()):
    # Without at least one seed in the graph there is no trust to propagate.
    raise ValueError('None of the PageRank seed usernames appear in the friendship graph.')

# Propagate trust from the seeds with personalized PageRank, run on GP — the same
# graph the personalization vector was built from.
# alpha is the damping factor: 0.9 is used here (networkx's default is 0.85).
pr_trust_scores = nx.pagerank(GP, personalization=reset_trust_scores, alpha=0.9)

# Tabulate the scores as (username, trust_score_pr) rows.
pr_trust_scores_df = pd.DataFrame.from_dict(pr_trust_scores, orient='index').reset_index()
pr_trust_scores_df.columns = ['username', 'trust_score_pr']
# Lower-case usernames for later joins; .str.lower() passes missing values through
# instead of raising the way a per-row x.lower() call would.
pr_trust_scores_df['username'] = pr_trust_scores_df['username'].str.lower()
pr_trust_scores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3386 entries, 0 to 3385
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   username        3386 non-null   object 
 1   trust_score_pr  3386 non-null   float64
dtypes: float64(1), object(1)
memory usage: 53.0+ KB


# Average and output score

In [11]:
# Join the two score tables on username, keeping only users present in both.
trust_rank_df_2in1 = trust_scores_df.merge(pr_trust_scores_df, how='inner', on='username')

# The combined trust score is the plain average of the two propagated scores.
pcc_scores = trust_rank_df_2in1['trust_score_pcc']
pr_scores = trust_rank_df_2in1['trust_score_pr']
trust_rank_df_2in1['trust_score'] = (pcc_scores + pr_scores) / 2

# Preview the first rows of the combined table.
trust_rank_df_2in1.head()

Unnamed: 0,username,trust_score_pcc,trust_score_pr,trust_score
0,ausguy2013,0.001849,0.004827,0.003338
1,letizia,5.7e-05,0.00015,0.000104
2,tt,5.7e-05,0.00015,0.000104
3,constanza28,0.002723,0.003851,0.003287
4,carol39,5.7e-05,0.00015,0.000104


In [12]:
# Load the processed profile table; the first CSV column holds the saved index.
username_to_index_df = pd.read_csv('d:/falconx/datingappdatascrap/dissimilarity/processed_data.csv', index_col=0)
# Lower-case usernames so they can be joined on 'username' with the lower-cased
# score tables. The vectorised .str accessor leaves missing usernames as NaN
# instead of raising, so no prior dropna is needed.
username_to_index_df['username'] = username_to_index_df['username'].str.lower()
username_to_index_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56984 entries, 0 to 56983
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0.2          56984 non-null  int64  
 1   Unnamed: 0.1          56984 non-null  int64  
 2   Unnamed: 0            56984 non-null  int64  
 3   scam                  56984 non-null  int64  
 4   username              56984 non-null  object 
 5   age                   56980 non-null  float64
 6   gender                56984 non-null  object 
 7   location              55956 non-null  object 
 8   ethnicity             52661 non-null  object 
 9   occupation            34584 non-null  object 
 10  marital_status        55041 non-null  object 
 11  children              51452 non-null  object 
 12  religion              51112 non-null  object 
 13  sexual_orientation    51451 non-null  object 
 14  sex                   56984 non-null  object 
 15  description           12

In [14]:
# Attach profile attributes to every scored username (the left join keeps scored
# users even when no profile row matches), then order by descending trust score.
trust_rank_profiles = (
    trust_rank_df_2in1
    .merge(username_to_index_df, how='left', on='username')
    .sort_values(by='trust_score', ascending=False)
)
# Export the ranked table without the DataFrame index.
trust_rank_profiles.to_csv('trust_rank_profiles.csv', index=False)

In [15]:
# Drop rows with no value in 'sex', then keep the first 500 remaining rows in the
# existing row order of trust_rank_profiles.
trust_rank_samples = trust_rank_profiles.dropna(subset=['sex']).head(500)
# Export the sample without the DataFrame index.
trust_rank_samples.to_csv('trust_rank_samples.csv', index=False)

# Normalize (scale trust scores so the maximum is 10)

In [18]:
# Extremes of the combined trust score (min_score is not used by the scaling below).
score_column = trust_rank_profiles['trust_score']
min_score = score_column.min()
max_score = score_column.max()

# Divide by the maximum and multiply by 10, so the highest score maps to 10.
trust_rank_profiles['scaled_trust_score'] = score_column / max_score * 10

# Rewrite the exported file so it also contains the scaled column.
trust_rank_profiles.to_csv('trust_rank_profiles.csv', index=False)
