## Reading real profile jsons into df

In [3]:
import json
import os
import glob
import pandas as pd


In [None]:

json_dir = 'data/realprofile'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

dfs = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
        json_data['site'] = file.rsplit("/", 1)[-1]
    dfs.append(json_data)
realdf = pd.concat(dfs)
realdf.to_csv('uncleaned_real_profile.csv', header=True, index=False)

In [5]:
realdf = pd.read_csv("../uncleaned_real_profile.csv")

## Reading scam profile jsons into df

In [None]:
json_dir = 'data/scamprofile'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

dfs = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
        json_data['site'] = file.rsplit("/", 1)[-1]
    dfs.append(json_data)
scamdf = pd.concat(dfs)

scamdf.to_csv('uncleaned_scam_profile.csv', header=True, index=False)

In [6]:
scamdf = pd.read_csv("../uncleaned_scam_profile.csv")

# Scam df data cleaning 
- Select only relevant columns: username, age, occupation, status, gender, description
- keep only the 4255 rows with non null description
- For description column, 
    - remove the \n at the beginning of the sentence
    - remove punctuations
    - remove stop words
    - Use TF-IDF vectorizer to encode the descriptions (actually based on the research paper, they trained an SVM       algorithm (linear kernel) as implemented in LibShortText, an open-source software package for short-         text classification and analysis)
    - Text vectorization: https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/ch04.html
    
    

In [7]:
pd.set_option("display.max_colwidth", None)

In [8]:
#select relevant columns
scamdf1 = scamdf[["username","age","occupation","status","gender","description"]].copy()
len(scamdf1)

4342

In [9]:
#drop all rows with no descriptions - 87 rows dropped
scamdf1 = scamdf1[scamdf1["description"].notna()]
len(scamdf1)

4255

In [10]:
#remove punctuation
import re
def get_new_sentence(x):
    return re.sub(r'[^\w\s]', '', x).strip()

scamdf1["description"] = scamdf1["description"].apply(lambda x: get_new_sentence(x))

#clean age column
def get_age(x):
    x = str(x)
    if 'or' in x:
        return x.split()[0]
    else:
        return x

scamdf1["age"] = scamdf1["age"].apply(lambda x: get_age(x))

In [11]:
#removing punctuations from age and gender columns
scamdf1["gender"] = scamdf1["gender"].apply(lambda x: get_new_sentence(x))
scamdf1["age"] = scamdf1["age"].apply(lambda x: get_new_sentence(x))

In [12]:
#remove stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stop_words(description):
    
    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(description)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    return ' '.join(filtered_sentence)

#create a new column to store the filtered sentences
scamdf1["filtered sentence"] = scamdf1["description"].apply(lambda x: remove_stop_words(x))

In [13]:
scamdf1["y"] = 1 #1 for scam profiles
scamdf1.head()

Unnamed: 0,username,age,occupation,status,gender,description,filtered sentence,y
0,000love,33,student,single,female,i am cool simple easy going easy to please if treated with resp ect i am cool simple easy going easy to please if treated with resp ecti am cool simple easy going easy to please if treated with resp ect,cool simple easy going easy please treated resp ect cool simple easy going easy please treated resp ecti cool simple easy going easy please treated resp ect,1
1,05johnsmith,57,contractor,widowed,male,i am kind honest love and caring man my hobby are music and sport i believe in god,kind honest love caring man hobby music sport believe god,1
2,1234,57,business,widowed,male,As you know that its myriads that someone has lots of interested women in here but its my choice to choose onethe one my heart goes forsomeone i can lean onto give me the love i have always wanted to haveSo tell me about yourselfwhat you do and everything you need to tell me about yourselfand lets see how it goes from hereBut anywayHeres a little info about me I like to go out to dinner go dancinggo to the movies workout travel see new things I enjoy the simple things like the ocean cuddling at home watching a movie with someone special Im a extremely romantic passionate and affectionate woman,As know myriads someone lots interested women choice choose onethe one heart goes forsomeone lean onto give love always wanted haveSo tell yourselfwhat everything need tell yourselfand lets see goes hereBut anywayHeres little info I like go dinner go dancinggo movies workout travel see new things I enjoy simple things like ocean cuddling home watching movie someone special Im extremely romantic passionate affectionate woman,1
3,231411,53,Building Contractor,single,male,I am looking for a woman whom I think is handsome sexy and smart A perfect simple woman one who treats me well and who makes me laugh she is not afraid to grab my hand and steal a kiss in public or to just shoot a knowing stare across the room that tells me she is admiring me from afarAt this stage in my life I am focused on finding you I have a great deal to offer and will always provide a warm heart and welcoming smile I am optimistic youre out there and perhaps ifwhen you read this far in my profile it may resonate with you\nBe smart and contact me directly henshawmark is my ID and to figure the network you ddd the first alphabet of the following word in capital Yoke Apple Hot Orange Orange Let see if you solve it by writing me Had to sneak this in lol,I looking woman I think handsome sexy smart A perfect simple woman one treats well makes laugh afraid grab hand steal kiss public shoot knowing stare across room tells admiring afarAt stage life I focused finding I great deal offer always provide warm heart welcoming smile I optimistic youre perhaps ifwhen read far profile may resonate Be smart contact directly henshawmark ID figure network ddd first alphabet following word capital Yoke Apple Hot Orange Orange Let see solve writing Had sneak lol,1
4,3234714190Txtme,33,self employed,single,female,Just a note I used to be able to do a name search for people using the Meet Me feature I can no longer do that so please send a message instead or at least hit Favorite so I can find you\nI have a really good sense of humor quickthinking wit Very kindhearted and sometimes maybe too logical I love to travel have game nights bar b ques with friends and family I only drink socially have a good group of family and friends who when we do get together laugh a lot I love being active and in nature ride bikes exploring new places I love old school rock and some older country and wherever I am if I know the song playing I may accidentally start singing out loud in front of other people hopefully youll join me I cry every time I hear The National Anthem Im a simple girl and try and be as stressfree as possible Ive never been more selfaware and selfconfident in my life Very happy and excited about my future in so many ways Work love the possibilities are endless I love love a great smile on someone A good kisser is a must,Just note I used able name search people using Meet Me feature I longer please send message instead least hit Favorite I find I really good sense humor quickthinking wit Very kindhearted sometimes maybe logical I love travel game nights bar b ques friends family I drink socially good group family friends get together laugh lot I love active nature ride bikes exploring new places I love old school rock older country wherever I I know song playing I may accidentally start singing loud front people hopefully youll join I cry every time I hear The National Anthem Im simple girl try stressfree possible Ive never selfaware selfconfident life Very happy excited future many ways Work love possibilities endless I love love great smile someone A good kisser must,1


In [14]:
scamdf1.drop(columns = "description", inplace = True)

In [15]:
scamdf1.head()

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,y
0,000love,33,student,single,female,cool simple easy going easy please treated resp ect cool simple easy going easy please treated resp ecti cool simple easy going easy please treated resp ect,1
1,05johnsmith,57,contractor,widowed,male,kind honest love caring man hobby music sport believe god,1
2,1234,57,business,widowed,male,As know myriads someone lots interested women choice choose onethe one heart goes forsomeone lean onto give love always wanted haveSo tell yourselfwhat everything need tell yourselfand lets see goes hereBut anywayHeres little info I like go dinner go dancinggo movies workout travel see new things I enjoy simple things like ocean cuddling home watching movie someone special Im extremely romantic passionate affectionate woman,1
3,231411,53,Building Contractor,single,male,I looking woman I think handsome sexy smart A perfect simple woman one treats well makes laugh afraid grab hand steal kiss public shoot knowing stare across room tells admiring afarAt stage life I focused finding I great deal offer always provide warm heart welcoming smile I optimistic youre perhaps ifwhen read far profile may resonate Be smart contact directly henshawmark ID figure network ddd first alphabet following word capital Yoke Apple Hot Orange Orange Let see solve writing Had sneak lol,1
4,3234714190Txtme,33,self employed,single,female,Just note I used able name search people using Meet Me feature I longer please send message instead least hit Favorite I find I really good sense humor quickthinking wit Very kindhearted sometimes maybe logical I love travel game nights bar b ques friends family I drink socially good group family friends get together laugh lot I love active nature ride bikes exploring new places I love old school rock older country wherever I I know song playing I may accidentally start singing loud front people hopefully youll join I cry every time I hear The National Anthem Im simple girl try stressfree possible Ive never selfaware selfconfident life Very happy excited future many ways Work love possibilities endless I love love great smile someone A good kisser must,1


In [16]:
# check for rows that have no descriptions
scamdf1[scamdf1["filtered sentence"] == '']

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,y
20,adalhardbartholo,42,Oil and Gas consultant,single,male,,1
26,adamsandler,53,doctor,single,male,,1
30,adamslilly,51,laywer,widowed,female,,1
31,adamsmorgan,46,manufacturing,single,male,,1
39,Adriana200,30,self employed,single,female,,1
...,...,...,...,...,...,...,...
4297,wilsonmorgan,48,engineer,widowed,male,,1
4314,wyanebruce,42,–,–,male,,1
4319,w_paul67,38,–,–,male,,1
4329,yolly36,36,military,single,female,,1


In [17]:
# remove rows that have no descriptions
scamdf1 = scamdf1[scamdf1["filtered sentence"] != ''].copy()

In [18]:
#remove rows that are not in english 
from langdetect import detect
scamdf1['Language'] = scamdf1['filtered sentence'].apply(detect)
scamdf1 = scamdf1[scamdf1['Language']=='en'].copy()

In [19]:
# drop language column
scamdf1.drop(columns='Language', inplace=True)

In [20]:
# removing occupation value count ==1 
scamdf1['occupation'].value_counts()

temp=scamdf1['occupation'].tolist()
scamdf1 = scamdf1[scamdf1['occupation'].apply(lambda x: temp.count(x)>1)].copy()

In [21]:
# removing status value count ==1 
scamdf1['status'].value_counts()

tempstatus=scamdf1['status'].tolist()
scamdf1 = scamdf1[scamdf1['status'].apply(lambda x: tempstatus.count(x)>1)].copy()

In [22]:
# check value counts for gender column
scamdf1['gender'].value_counts()

male      1816
female    1144
Name: gender, dtype: int64

In [23]:
# check null values
scamdf1.isna().sum()

username             0
age                  0
occupation           4
status               2
gender               0
filtered sentence    0
y                    0
dtype: int64

In [24]:
# drop null rows
scamdf1.dropna(inplace=True)

In [25]:
scamdf1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2954 entries, 0 to 4341
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   username           2954 non-null   object
 1   age                2954 non-null   object
 2   occupation         2954 non-null   object
 3   status             2954 non-null   object
 4   gender             2954 non-null   object
 5   filtered sentence  2954 non-null   object
 6   y                  2954 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 184.6+ KB


In [26]:
# final 2959 rows
scamdf1.to_csv('scam_profile.csv', header=True, index=False)

# Real df data cleaning
- Select only relevant columns - username, gender, age, occupation, status, description
- For description, 
    - if description == '-':
        - description = ethnicity + children + orientation + religion + smoking + drinking + intent
    - remove punctuations 
    - check how many non english descriptions 

In [27]:
realdf.head()

Unnamed: 0,gender,age,location,status,username,ethnicity,occupation,description,match_age,children,orientation,religion,smoking,drinking,intent,site
0,female,39 y.o.,"Johannesburg, South Africa",single,00,black,Frailcare nurse,-,from 36 to 50,1-2 living with me,Straight,Christian,non-smoker,social drinker,Romance,realprofile\real00.json
1,female,43 y.o.,"Manizales, Caldas, Colombia",separated,0065patricia,native american,Administradora,-,from 19 to 86,more than 2 living elsewhere,Bisexual,Other,non-smoker,never,Serious Relationship,realprofile\real0065patricia.json
2,male,63 y.o.,"Lima, Perú",divorced,007,white,comercio,-,from 19 to 86,1-2 living elsewhere,Straight,Christian,non-smoker,occasional drinker,Friendship,realprofile\real007.json
3,male,59 y.o.,"London, UK",divorced,0077654321,white,Developer,-,from 48 to 65,1-2 living elsewhere,Straight,Atheist,non-smoker,occasional drinker,"Fun, Friendship, Romance, Serious Relationship",realprofile\real0077654321.json
4,male,30 y.o.,"Sucre, Bolivia",single,00alex00,hispanic,Guardia de seguridad,-,from 19 to 86,don't want children,Straight,Other,non-smoker,never,"Fun, Friendship, Romance",realprofile\real00alex00.json


In [28]:
realdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17724 entries, 0 to 17723
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   gender       17724 non-null  object
 1   age          17724 non-null  object
 2   location     17724 non-null  object
 3   status       17724 non-null  object
 4   username     17723 non-null  object
 5   ethnicity    17724 non-null  object
 6   occupation   17724 non-null  object
 7   description  17724 non-null  object
 8   match_age    17724 non-null  object
 9   children     17724 non-null  object
 10  orientation  17724 non-null  object
 11  religion     17724 non-null  object
 12  smoking      17724 non-null  object
 13  drinking     17724 non-null  object
 14  intent       17724 non-null  object
 15  site         17724 non-null  object
dtypes: object(16)
memory usage: 2.2+ MB


In [29]:
realdf2 = realdf[realdf["description"] == "-"].copy()

In [30]:
realdf2["description"] = realdf2["ethnicity"] +' '+ realdf2["orientation"] +' '+ realdf2["religion"] +' '+ realdf2["smoking"] +' '+ realdf2["drinking"] +' '+ realdf2["intent"]
realdf2.head(10)


Unnamed: 0,gender,age,location,status,username,ethnicity,occupation,description,match_age,children,orientation,religion,smoking,drinking,intent,site
0,female,39 y.o.,"Johannesburg, South Africa",single,00,black,Frailcare nurse,black Straight Christian non-smoker social drinker Romance,from 36 to 50,1-2 living with me,Straight,Christian,non-smoker,social drinker,Romance,realprofile\real00.json
1,female,43 y.o.,"Manizales, Caldas, Colombia",separated,0065patricia,native american,Administradora,native american Bisexual Other non-smoker never Serious Relationship,from 19 to 86,more than 2 living elsewhere,Bisexual,Other,non-smoker,never,Serious Relationship,realprofile\real0065patricia.json
2,male,63 y.o.,"Lima, Perú",divorced,007,white,comercio,white Straight Christian non-smoker occasional drinker Friendship,from 19 to 86,1-2 living elsewhere,Straight,Christian,non-smoker,occasional drinker,Friendship,realprofile\real007.json
3,male,59 y.o.,"London, UK",divorced,0077654321,white,Developer,"white Straight Atheist non-smoker occasional drinker Fun, Friendship, Romance, Serious Relationship",from 48 to 65,1-2 living elsewhere,Straight,Atheist,non-smoker,occasional drinker,"Fun, Friendship, Romance, Serious Relationship",realprofile\real0077654321.json
4,male,30 y.o.,"Sucre, Bolivia",single,00alex00,hispanic,Guardia de seguridad,"hispanic Straight Other non-smoker never Fun, Friendship, Romance",from 19 to 86,don't want children,Straight,Other,non-smoker,never,"Fun, Friendship, Romance",realprofile\real00alex00.json
5,male,32 y.o.,"Colinas de Cerro Viento, Panamá",single,00lucero90,hispanic,Ingeniero naval,"hispanic Straight Christian social smoker social drinker Fun, Friendship, Romance",from 19 to 86,no children,Straight,Christian,social smoker,social drinker,"Fun, Friendship, Romance",realprofile\real00lucero90.json
7,male,52 y.o.,"Bucaramanga, Santander, Colombia",divorced,01Anselmo212,hispanic,Ingeniero,hispanic Straight Christian non-smoker never Serious Relationship,from 30 to 40,1-2 living elsewhere,Straight,Christian,non-smoker,never,Serious Relationship,realprofile\real01Anselmo212.json
8,male,49 y.o.,"Miami, Florida, EE. UU.",divorced,0279602,hispanic,Mantenimiento,hispanic Straight Other non-smoker never Serious Relationship,from 33 to 50,1-2 living elsewhere,Straight,Other,non-smoker,never,Serious Relationship,realprofile\real0279602.json
10,male,40 y.o.,"Chicago, IL, USA",single,03Robert,hispanic,-,"hispanic Straight Other light smoker social drinker Friendship, Romance, Serious Relationship",from 28 to 41,want children,Straight,Other,light smoker,social drinker,"Friendship, Romance, Serious Relationship",realprofile\real03Robert.json
11,male,49 y.o.,"Arequipa, Perú",single,06Wil09,hispanic,Docente,"hispanic Straight Christian non-smoker never Fun, Romance, Marriage",from 21 to 40,no children,Straight,Christian,non-smoker,never,"Fun, Romance, Marriage",realprofile\real06Wil09.json


In [31]:
#combine realdf2 with realdf3 
realdf3 = realdf[realdf["description"] != '-']
realdf4 = pd.concat([realdf3, realdf2])
len(realdf4)

17724

In [32]:
from langdetect import detect
realdf4['Language'] = realdf4['description'].apply(detect)


In [33]:
realdf4.head()

Unnamed: 0,gender,age,location,status,username,ethnicity,occupation,description,match_age,children,orientation,religion,smoking,drinking,intent,site,Language
6,male,58 y.o.,"San Salvador, El Salvador",divorced,0119162866,hispanic,Negocio,"Honesto. Trabajador, divertido. Me gusta la naturaleza, viajar, bailar.",from 30 to 42,no children,Straight,Spiritual,social smoker,never,Friendship,realprofile\real0119162866.json,es
9,male,38 y.o.,"Boaco, Nicaragua",single,02erick,hispanic,Decoraciones eventos privados,Soy un hombre sociable cariñoso y me gusta conoser personas y tener la oportunidad de establecer una relación en algún momento,from 24 to 39,want children,Straight,Christian,social smoker,occasional drinker,"Fun, Friendship, Romance, Serious Relationship",realprofile\real02erick.json,es
18,male,35 y.o.,"Bogota, Colombia",single,1047373361es,native american,Electrcista,Honesta honrrada y trabajadora y muy alegre amigable,from 30 to 50,no children,Straight,Other,non-smoker,never,"Serious Relationship, Marriage",realprofile\real1047373361es.json,es
22,female,40 y.o.,"Machala, Ecuador",single,12345bonita,black,-,Soy negra mido metro 150y soy gordita,from 19 to 86,more than 2 living with me,Bisexual,Christian,non-smoker,never,Serious Relationship,realprofile\real12345bonita.json,es
26,male,66 y.o.,"San Jose, CA, USA",single,123canwe,white,Retired,I am full of fire and friskier than hell!!!!,from 19 to 64,don't want children,Straight,Christian,non-smoker,never,"Fun, Friendship, Romance, Serious Relationship",realprofile\real123canwe.json,en


In [34]:
#removing rows that description is not english
realdf5 = realdf4[realdf4["Language"] == 'en'].copy()

In [36]:
#select relevant columns
realdf = realdf5[["username","age","occupation","status","gender","description"]].copy()

#remove 'y.o' from age
def get_age(x):
    return x.split('y')[0]
     
realdf["age"] = realdf["age"].apply(lambda x: get_age(x))

#remove punctuation
import re
def get_new_sentence(x):
    return re.sub(r'[^\w\s]', '', x).strip()

realdf["description"] = realdf["description"].apply(lambda x: get_new_sentence(x))

In [37]:
#removing punctuations from age, status and gender columns
realdf["gender"] = realdf["gender"].apply(lambda x: get_new_sentence(x))
realdf["age"] = realdf["age"].apply(lambda x: get_new_sentence(x))
realdf["status"] = realdf["status"].apply(lambda x: get_new_sentence(x))

In [38]:
#remove stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stop_words(description):
    
    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(description)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    return ' '.join(filtered_sentence)

#create a new column to store the filtered sentences
realdf["filtered sentence"] = realdf["description"].apply(lambda x: remove_stop_words(x))

In [39]:
realdf.drop(columns = "description", inplace = True)

In [40]:
realdf["y"] = 0
realdf.head()

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,y
26,123canwe,66,Retired,single,male,I full fire friskier hell,0
30,123WILFREDO,28,ESTUDIANTE,single,male,ME GUSTA CONOCER CHICAS BONDADOSAS CARIÑOSAS Y MUY FUERTES EN LA MORAL,0
44,1907,48,Construction,single,male,laid back earth good sense humor,0
59,1engineer,37,single engineer,single,male,I like travel watch movies read books sports different kind activities I passion life love among people people,0
63,1SweetChick,56,-,single,female,Im looking make friends Ive great personality I love happy laughing negative person I like spread joy And maybe relationship point Have great day,0


In [41]:
# removing occupation value count ==1 
realtemp=realdf['occupation'].tolist()
realdf=realdf[realdf['occupation'].apply(lambda x: realtemp.count(x)>1)].copy()

In [42]:
realdf['occupation'].value_counts()

-                      4894
Estudiante              164
Retired                 151
Independiente           151
Comerciante             148
                       ... 
contador publico          2
Terapista                 2
Diseñadora Gráfica        2
Casa                      2
Mechanical engineer       2
Name: occupation, Length: 789, dtype: int64

In [44]:
# check null occupation
realdf[realdf['occupation']=='-']

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,y
63,1SweetChick,56,-,single,female,Im looking make friends Ive great personality I love happy laughing negative person I like spread joy And maybe relationship point Have great day,0
147,aaronb69,55,-,separated,male,Free thirtyn years hell want fun know,0
361,Adrimart,40,-,single,female,SOY ALEGRE SOCIABLE TRABAJADORA CARIÑOSA CARISMÁTICA,0
375,Adzman,40,-,single,male,Iam Adam single looking real love someone handle sex drive iam kind caring loving loyal,0
377,aelix,37,-,single,male,Never married No kids I love good morning smile goodnight kiss Looking real woman knows wants I NOT ONS thnx,0
...,...,...,...,...,...,...,...
17700,ZouZ961,37,-,single,male,white Straight Christian light smoker social drinker Serious Relationship,0
17710,Zuli,35,-,single,female,hispanic Straight Christian nonsmoker social drinker Friendship,0
17714,zuriaco,54,-,single,male,white Straight Christian nonsmoker never Romance Serious Relationship,0
17718,Zxc,50,-,single,male,white Straight Christian nonsmoker occasional drinker Serious Relationship,0


In [45]:
# removing null occupation
realdf = realdf[realdf["occupation"] != '-'].copy()
realdf

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,y
26,123canwe,66,Retired,single,male,I full fire friskier hell,0
30,123WILFREDO,28,ESTUDIANTE,single,male,ME GUSTA CONOCER CHICAS BONDADOSAS CARIÑOSAS Y MUY FUERTES EN LA MORAL,0
44,1907,48,Construction,single,male,laid back earth good sense humor,0
105,52Jim52,70,Retired,divorced,male,Retired owner aerospace consuloting firm,0
146,Aaron90,28,Social worker,single,male,Hello aaron Vermont I vermont 2 years Im really fun person I like camping bone fire etc,0
...,...,...,...,...,...,...,...
17712,ZullynAzul2,25,Estudiante,single,female,white Straight Christian nonsmoker never Serious Relationship,0
17713,zunilda,66,docente,single,female,hispanic Straight Other nonsmoker never Friendship,0
17717,Zwadiemarie22,41,Probation Officer,separated,female,black Straight Christian nonsmoker occasional drinker Serious Relationship Marriage,0
17719,Zxxxx,26,Profesora,single,female,hispanic Straight Christian nonsmoker never Friendship Romance,0


In [46]:
# checking value_counts for status
realdf['status'].value_counts()

single             3482
divorced           1040
separated           697
widowed             178
in relationship     110
married              75
Name: status, dtype: int64

In [47]:
# checking value_counts for gender
realdf['gender'].value_counts()

male      3821
female    1761
Name: gender, dtype: int64

In [48]:
realdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5582 entries, 26 to 17722
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   username           5582 non-null   object
 1   age                5582 non-null   object
 2   occupation         5582 non-null   object
 3   status             5582 non-null   object
 4   gender             5582 non-null   object
 5   filtered sentence  5582 non-null   object
 6   y                  5582 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 348.9+ KB


In [49]:
# final 5582 rows
realdf.to_csv('real_profile.csv', header=True, index=False)

# combined dataset

In [50]:
final_df = pd.concat([realdf,scamdf1])
final_df.head()

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,y
26,123canwe,66,Retired,single,male,I full fire friskier hell,0
30,123WILFREDO,28,ESTUDIANTE,single,male,ME GUSTA CONOCER CHICAS BONDADOSAS CARIÑOSAS Y MUY FUERTES EN LA MORAL,0
44,1907,48,Construction,single,male,laid back earth good sense humor,0
105,52Jim52,70,Retired,divorced,male,Retired owner aerospace consuloting firm,0
146,Aaron90,28,Social worker,single,male,Hello aaron Vermont I vermont 2 years Im really fun person I like camping bone fire etc,0


In [53]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8536 entries, 26 to 4341
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   username           8536 non-null   object
 1   age                8536 non-null   object
 2   occupation         8536 non-null   object
 3   status             8536 non-null   object
 4   gender             8536 non-null   object
 5   filtered sentence  8536 non-null   object
 6   y                  8536 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 533.5+ KB


In [52]:
#read to csv file
final_df.to_csv('combined_profile.csv', header=True, index=False)

In [None]:
final_df[final_df["username"] == "Aigul0468"]

# 4. Distribution plots for categorical variables 

## 4.1 Age distribution of real profiles

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

counts = realdf["Age"].value_counts(normalize = True)

fig = px.bar(counts, title="Age of real profiles")
fig.update_layout(
    xaxis_title = "Age",
    yaxis_title = "Number of profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

counts = scamdf1["age"].value_counts(normalize = True)

fig = px.bar(counts, title="Age of scam profiles")
fig.update_layout(
    xaxis_title = "Age",
    yaxis_title = "Number of profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

## 4.1 Gender distribution 

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

counts = realdf["gender"].value_counts(normalize = True)

fig = px.bar(counts, title="Gender distribution of real profiles")
fig.update_layout(
    xaxis_title = "Gender",
    yaxis_title = "Percentage of profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

counts = scamdf["gender"].value_counts(normalize = True)

fig = px.bar(counts, title="Gender distribution of scam profiles")
fig.update_layout(
    xaxis_title = "Gender",
    yaxis_title = "Percentage of profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

## 4.6 Status

In [None]:
counts = realdf["status"].value_counts(normalize = True)

fig = px.bar(counts, title="Status of real profiles")
fig.update_layout(
    xaxis_title = "Status",
    yaxis_title = "Percentage of real profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

In [None]:
counts = scamdf["status"].value_counts(normalize = True)

fig = px.bar(counts, title="Status of scam profiles")
fig.update_layout(
    xaxis_title = "Status",
    yaxis_title = "Percentage of scam profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

In [None]:
counts = scamdf["occupation"].value_counts(normalize = True)

fig = px.bar(counts, title="Occupation of scam profiles")
fig.update_layout(
    xaxis_title = "Occupations",
    yaxis_title = "Percentage of scam profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

In [None]:
counts = realdf["occupation"].value_counts(normalize = True)

fig = px.bar(counts, title="Occupation of real profiles")
fig.update_layout(
    xaxis_title = "Occupations",
    yaxis_title = "Percentage of real profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

## Get Image name of selected scam profiles

In [None]:
import json
import os
import glob
import pandas as pd
import numpy as np

data = pd.read_csv("scam_profile.csv")
data

In [None]:
json_dir = 'data/scamprofile'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

dfs = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
        json_data['site'] = file.rsplit("/", 1)[-1]
    dfs.append(json_data)
scamdf = pd.concat(dfs)

In [None]:
scamdf

In [None]:
selected = np.array(scamdf[scamdf['username'].isin(data['username'])]['images'])
pd.DataFrame(scamdf[scamdf['username'].isin(data['username'])]['images'].explode()).to_csv("selected_scam_images.csv",index=False)
# pd.read_csv("selected_scam_images.csv")

## Selected Real Profile Images

In [None]:
data1 = pd.read_csv("real_profile.csv")

pd.DataFrame(data1['username']+".jpg").to_csv("selected_real_images.csv",index=False)

In [None]:
pd.read_csv("selected_real_images.csv")