## Reading real profile jsons into df

In [1]:
import json
import os
import glob
import pandas as pd


In [None]:

json_dir = 'data/realprofile'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

dfs = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
        json_data['site'] = file.rsplit("/", 1)[-1]
    dfs.append(json_data)
realdf = pd.concat(dfs)
realdf.to_csv('uncleaned_real_profile.csv', header=True, index=False)

In [178]:
realdf = pd.read_csv("../uncleaned_real_profile.csv")

## Reading scam profile jsons into df

In [None]:
json_dir = 'data/scamprofile'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

dfs = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
        json_data['site'] = file.rsplit("/", 1)[-1]
    dfs.append(json_data)
scamdf = pd.concat(dfs)

scamdf.to_csv('uncleaned_scam_profile.csv', header=True, index=False)

In [319]:
scamdf = pd.read_csv("../uncleaned_scam_profile.csv")

# Scam df data cleaning 
- Select only relevant columns: username, age, occupation, status, gender, description
- keep only the 4255 rows with non null description
- For description column, 
    - remove the \n at the beginning of the sentence
    - remove punctuations
    - remove stop words
    - Use TF-IDF vectorizer to encode the descriptions (actually based on the research paper, they trained an SVM       algorithm (linear kernel) as implemented in LibShortText, an open-source software package for short-         text classification and analysis)
    - Text vectorization: https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/ch04.html
    
    

In [320]:
pd.set_option("display.max_colwidth", None)

In [321]:
#select relevant columns
scamdf1 = scamdf[["username","age","occupation","status","gender","description"]].copy()
len(scamdf1)

4342

In [322]:
#drop all rows with no descriptions - 87 rows dropped
scamdf1 = scamdf1[scamdf1["description"].notna()]
len(scamdf1)

4255

In [323]:
#remove punctuation
import re
def get_new_sentence(x):
    return re.sub(r'[^\w\s]', '', str(x)).strip()

scamdf1["description"] = scamdf1["description"].apply(lambda x: get_new_sentence(x))

#clean age column
def get_age(x):
    x = str(x)
    if 'or' in x:
        return x.split()[0]
    else:
        return x

scamdf1["age"] = scamdf1["age"].apply(lambda x: get_age(x))

In [324]:
#removing punctuations from age, status, gender and occupation columns
scamdf1["gender"] = scamdf1["gender"].apply(lambda x: get_new_sentence(x))
scamdf1["age"] = scamdf1["age"].apply(lambda x: get_new_sentence(x))
scamdf1['occupation']= scamdf1['occupation'].apply(lambda x: get_new_sentence(x))

In [325]:
# strip whitespace
scamdf1['occupation'].str.strip()

0                      student
1                   contractor
2                     business
3          Building Contractor
4                self employed
                 ...          
4337                 sell girl
4338           self employment
4339    Flower designer course
4340                     nurse
4341                   teacher
Name: occupation, Length: 4255, dtype: object

In [326]:
# remove empty occupation
scamdf1 = scamdf1[scamdf1['occupation']!='']

In [327]:
#remove stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stop_words(description):
    
    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(description)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    return ' '.join(filtered_sentence)

#create a new column to store the filtered sentences
scamdf1["filtered sentence"] = scamdf1["description"].apply(lambda x: remove_stop_words(x))

In [328]:
scamdf1["y"] = 1 #1 for scam profiles
scamdf1.head()

Unnamed: 0,username,age,occupation,status,gender,description,filtered sentence,y
0,000love,33,student,single,female,i am cool simple easy going easy to please if treated with resp ect i am cool simple easy going easy to please if treated with resp ecti am cool simple easy going easy to please if treated with resp ect,cool simple easy going easy please treated resp ect cool simple easy going easy please treated resp ecti cool simple easy going easy please treated resp ect,1
1,05johnsmith,57,contractor,widowed,male,i am kind honest love and caring man my hobby are music and sport i believe in god,kind honest love caring man hobby music sport believe god,1
2,1234,57,business,widowed,male,As you know that its myriads that someone has lots of interested women in here but its my choice to choose onethe one my heart goes forsomeone i can lean onto give me the love i have always wanted to haveSo tell me about yourselfwhat you do and everything you need to tell me about yourselfand lets see how it goes from hereBut anywayHeres a little info about me I like to go out to dinner go dancinggo to the movies workout travel see new things I enjoy the simple things like the ocean cuddling at home watching a movie with someone special Im a extremely romantic passionate and affectionate woman,As know myriads someone lots interested women choice choose onethe one heart goes forsomeone lean onto give love always wanted haveSo tell yourselfwhat everything need tell yourselfand lets see goes hereBut anywayHeres little info I like go dinner go dancinggo movies workout travel see new things I enjoy simple things like ocean cuddling home watching movie someone special Im extremely romantic passionate affectionate woman,1
3,231411,53,Building Contractor,single,male,I am looking for a woman whom I think is handsome sexy and smart A perfect simple woman one who treats me well and who makes me laugh she is not afraid to grab my hand and steal a kiss in public or to just shoot a knowing stare across the room that tells me she is admiring me from afarAt this stage in my life I am focused on finding you I have a great deal to offer and will always provide a warm heart and welcoming smile I am optimistic youre out there and perhaps ifwhen you read this far in my profile it may resonate with you\nBe smart and contact me directly henshawmark is my ID and to figure the network you ddd the first alphabet of the following word in capital Yoke Apple Hot Orange Orange Let see if you solve it by writing me Had to sneak this in lol,I looking woman I think handsome sexy smart A perfect simple woman one treats well makes laugh afraid grab hand steal kiss public shoot knowing stare across room tells admiring afarAt stage life I focused finding I great deal offer always provide warm heart welcoming smile I optimistic youre perhaps ifwhen read far profile may resonate Be smart contact directly henshawmark ID figure network ddd first alphabet following word capital Yoke Apple Hot Orange Orange Let see solve writing Had sneak lol,1
4,3234714190Txtme,33,self employed,single,female,Just a note I used to be able to do a name search for people using the Meet Me feature I can no longer do that so please send a message instead or at least hit Favorite so I can find you\nI have a really good sense of humor quickthinking wit Very kindhearted and sometimes maybe too logical I love to travel have game nights bar b ques with friends and family I only drink socially have a good group of family and friends who when we do get together laugh a lot I love being active and in nature ride bikes exploring new places I love old school rock and some older country and wherever I am if I know the song playing I may accidentally start singing out loud in front of other people hopefully youll join me I cry every time I hear The National Anthem Im a simple girl and try and be as stressfree as possible Ive never been more selfaware and selfconfident in my life Very happy and excited about my future in so many ways Work love the possibilities are endless I love love a great smile on someone A good kisser is a must,Just note I used able name search people using Meet Me feature I longer please send message instead least hit Favorite I find I really good sense humor quickthinking wit Very kindhearted sometimes maybe logical I love travel game nights bar b ques friends family I drink socially good group family friends get together laugh lot I love active nature ride bikes exploring new places I love old school rock older country wherever I I know song playing I may accidentally start singing loud front people hopefully youll join I cry every time I hear The National Anthem Im simple girl try stressfree possible Ive never selfaware selfconfident life Very happy excited future many ways Work love possibilities endless I love love great smile someone A good kisser must,1


In [329]:
# detect language
from langdetect import detect
scamdf1['Language'] = scamdf1['occupation'].apply(detect)

In [330]:
# translate non english 
from googletrans import Translator, constants
from pprint import pprint

def translate(word):
    translator= Translator()
    if detect(word) != 'en':
        translation = translator.translate(word, dest='en')
        return translation.text
    return word

In [331]:
scamdf1['translated_occupation'] = scamdf1['occupation'].apply(lambda x: translate(x))

In [332]:
scamdf1.sample(10)

Unnamed: 0,username,age,occupation,status,gender,description,filtered sentence,y,Language,translated_occupation
1731,Jamesethan65,49,UN,single,male,Im Single looking for a honest woman in my lifeIm a very passionate and romantic man and not afraid to show my affections in private or public I am very clean and like seeing things and people clean as well I will always treat my woman well and with respect and love and cherish herIm very easy going and down to earthI Believe that a strong relationship is not based only on sex life but that a strong relationship builds a strong love life that leads to a strong and great sex life and can only happen with open communication trust and commitment and constantly working together and being honest with my woman,Im Single looking honest woman lifeIm passionate romantic man afraid show affections private public I clean like seeing things people clean well I always treat woman well respect love cherish herIm easy going earthI Believe strong relationship based sex life strong relationship builds strong love life leads strong great sex life happen open communication trust commitment constantly working together honest woman,1,sw,AND
2413,loisfugate,38,Self employment,single,female,am single looking for honest and caring man to spend the rest of my life with,single looking honest caring man spend rest life,1,en,Self employment
2029,john_barry,47,military,divorced,male,I am honest real and true I am a 47 years old musicloving man from California I have brown eyes and black hair my body is about average and I live alone I have children and they live at home Im looking forward to spending time with someone,I honest real true I 47 years old musicloving man California I brown eyes black hair body average I live alone I children live home Im looking forward spending time someone,1,tl,military
2528,macuscole,34,cargo agent,divorced,male,Am just a simple man looking forward in finding a serious woman who will be ready for a long term relation that can lead to marriage,Am simple man looking forward finding serious woman ready long term relation lead marriage,1,es,cargo agent
1329,garyrichard125,46,chemical engineering,single,male,am honest caring lovely understanding decent man,honest caring lovely understanding decent man,1,en,chemical engineering
316,Baleme,46,military,–,male,Im Luis Baleme an Engineer with the USA ArmyI am basically a very positive person who also loves humor and laughter and would like someone\nwith an adventurous spirit and a positive attitude that I need someone who can see the funny side of life someone who is\nopen and willing to learn new things,Im Luis Baleme Engineer USA ArmyI basically positive person also loves humor laughter would like someone adventurous spirit positive attitude I need someone see funny side life someone open willing learn new things,1,tl,military
11,abim123,52,legal,divorced,male,In few words I can say that I am serious have goals in my life but like a true woman I like to dream and make them come true I have serious profession and I am not going to stop at what Ive achieved for now but family is always on the first place for me But its impossible to be serious all the time and its not necessary I believe I like to have fun love sportlisten to good music and learn how to enjoy the simple moments in my life I am active open person with sense of humor I am strong enough to live alone but much better is to share life with someone nice,In words I say I serious goals life like true woman I like dream make come true I serious profession I going stop Ive achieved family always first place But impossible serious time necessary I believe I like fun love sportlisten good music learn enjoy simple moments life I active open person sense humor I strong enough live alone much better share life someone nice,1,hu,legal
2618,markknight,50,military,widowed,male,I think I am an open person honest if I think something I say it I never do something behind a persons back for me it is important I am calm lovely I like love and to be loved and communicationI enjoy so many things like aerobics hiking skiing walking candlelit dinners gardening reading psychology travelling movies theatres and concerts restaurants art galleries and exhibitions I like to experience new things but also like to spend time with my family play with my pet and decorate house,I think I open person honest I think something I say I never something behind persons back important I calm lovely I like love loved communicationI enjoy many things like aerobics hiking skiing walking candlelit dinners gardening reading psychology travelling movies theatres concerts restaurants art galleries exhibitions I like experience new things also like spend time family play pet decorate house,1,tl,military
1345,general_m_rodriguez,62,Army General,widowed,male,I am a simplecaringlovingGod fearingdecenthard working man and need a life time partner to build with and end of my life with cos am about to retire,I simplecaringlovingGod fearingdecenthard working man need life time partner build end life cos retire,1,de,Army General
3934,sweethoneyjean,30,nurse,single,female,I am a happy honest sincere person who enjoy life Im very organize and clean I like to hear music dancing watch TV use the computerwork in the garden go in road trips and travel I live with my daughter and take care of my grand kids after school on the week so Im not interest in relocate to other place I believe that internet relation need time and lots of interest\nam serious and i want to love and to be loved againi dont joke about this or play any head i am very serious at the beginning i was just not sure about the distance but you have changed my mind maybe there is a light at the end of the tunnel i want someone there every day someone to spend time with doing things together share everything together and more,I happy honest sincere person enjoy life Im organize clean I like hear music dancing watch TV use computerwork garden go road trips travel I live daughter take care grand kids school week Im interest relocate place I believe internet relation need time lots interest serious want love loved againi dont joke play head serious beginning sure distance changed mind maybe light end tunnel want someone every day someone spend time things together share everything together,1,ro,nurse


In [333]:
scamdf1.drop(columns = "description", inplace = True)

In [334]:
scamdf1.head()

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,y,Language,translated_occupation
0,000love,33,student,single,female,cool simple easy going easy please treated resp ect cool simple easy going easy please treated resp ecti cool simple easy going easy please treated resp ect,1,nl,student
1,05johnsmith,57,contractor,widowed,male,kind honest love caring man hobby music sport believe god,1,es,contractor
2,1234,57,business,widowed,male,As know myriads someone lots interested women choice choose onethe one heart goes forsomeone lean onto give love always wanted haveSo tell yourselfwhat everything need tell yourselfand lets see goes hereBut anywayHeres little info I like go dinner go dancinggo movies workout travel see new things I enjoy simple things like ocean cuddling home watching movie someone special Im extremely romantic passionate affectionate woman,1,en,business
3,231411,53,Building Contractor,single,male,I looking woman I think handsome sexy smart A perfect simple woman one treats well makes laugh afraid grab hand steal kiss public shoot knowing stare across room tells admiring afarAt stage life I focused finding I great deal offer always provide warm heart welcoming smile I optimistic youre perhaps ifwhen read far profile may resonate Be smart contact directly henshawmark ID figure network ddd first alphabet following word capital Yoke Apple Hot Orange Orange Let see solve writing Had sneak lol,1,en,Building Contractor
4,3234714190Txtme,33,self employed,single,female,Just note I used able name search people using Meet Me feature I longer please send message instead least hit Favorite I find I really good sense humor quickthinking wit Very kindhearted sometimes maybe logical I love travel game nights bar b ques friends family I drink socially good group family friends get together laugh lot I love active nature ride bikes exploring new places I love old school rock older country wherever I I know song playing I may accidentally start singing loud front people hopefully youll join I cry every time I hear The National Anthem Im simple girl try stressfree possible Ive never selfaware selfconfident life Very happy excited future many ways Work love possibilities endless I love love great smile someone A good kisser must,1,en,self employed


In [335]:
# check for rows that have no descriptions
scamdf1[scamdf1["filtered sentence"] == '']

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,y,Language,translated_occupation
20,adalhardbartholo,42,Oil and Gas consultant,single,male,,1,en,Oil and Gas consultant
26,adamsandler,53,doctor,single,male,,1,es,doctor
30,adamslilly,51,laywer,widowed,female,,1,cy,lawyer
31,adamsmorgan,46,manufacturing,single,male,,1,ro,manufacturing
39,Adriana200,30,self employed,single,female,,1,en,self employed
...,...,...,...,...,...,...,...,...,...
4199,walterdaniel,60,Construction engineer,widowed,male,,1,en,Construction engineer
4281,williamtom584,42,construction,single,male,,1,fr,construction
4297,wilsonmorgan,48,engineer,widowed,male,,1,nl,engineer
4329,yolly36,36,military,single,female,,1,tl,military


In [336]:
# remove rows that have no descriptions
scamdf1 = scamdf1[scamdf1["filtered sentence"] != ''].copy()

In [339]:
# check empty occupation again
scamdf1[scamdf1['occupation']=='']

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,y,Language,translated_occupation


In [340]:
# removing occupation value count ==1 
scamdf1['occupation'].value_counts()

temp=scamdf1['occupation'].tolist()
scamdf1 = scamdf1[scamdf1['occupation'].apply(lambda x: temp.count(x)>1)].copy()

In [342]:
# removing status value count ==1 
scamdf1['status'].value_counts()

tempstatus=scamdf1['status'].tolist()
scamdf1 = scamdf1[scamdf1['status'].apply(lambda x: tempstatus.count(x)>1)].copy()

In [343]:
# strip whitespace in status column
scamdf1['status']=scamdf1['status'].str.strip()

In [341]:
# check value counts for status column
scamdf1['status'].value_counts()

 single                                1540
 widowed                                835
 divorced                               418
 –                                       53
 separated                               22
 widower                                  8
 Widowed                                  8
 Divorced                                 6
 widow                                    4
 single or divorced                       2
 married or divorced                      1
 divorced or widowed                      1
 married (and looking for marriage)       1
 divorce                                  1
 windoew                                  1
 singles                                  1
 Single                                   1
 seprated                                 1
 married                                  1
 Separated                                1
Name: status, dtype: int64

In [345]:
# remove empty status
scamdf1=scamdf1[scamdf1['status']!='–'].copy()

In [348]:
# check for null status values
scamdf1['status'].value_counts()

single                1540
widowed                835
divorced               418
separated               22
widower                  8
Widowed                  8
Divorced                 6
widow                    4
single or divorced       2
Name: status, dtype: int64

In [349]:
# check null values
scamdf1.isna().sum()

username                 0
age                      0
occupation               0
status                   2
gender                   0
filtered sentence        0
y                        0
Language                 0
translated_occupation    0
dtype: int64

In [350]:
# drop null rows
scamdf1.dropna(inplace=True)

In [351]:
scamdf1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2843 entries, 0 to 4341
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   username               2843 non-null   object
 1   age                    2843 non-null   object
 2   occupation             2843 non-null   object
 3   status                 2843 non-null   object
 4   gender                 2843 non-null   object
 5   filtered sentence      2843 non-null   object
 6   y                      2843 non-null   int64 
 7   Language               2843 non-null   object
 8   translated_occupation  2843 non-null   object
dtypes: int64(1), object(8)
memory usage: 222.1+ KB


In [247]:
# final 2843 rows
scamdf1.to_csv('scam_profile.csv', header=True, index=False)

# Real df data cleaning
- Select only relevant columns - username, gender, age, occupation, status, description
- For description, 
    - if description == '-':
        - description = ethnicity + children + orientation + religion + smoking + drinking + intent
    - remove punctuations 
    - check how many non english descriptions 

In [285]:
realdf.head()

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,Language,y
26,123canwe,66,Retired,single,male,I full fire friskier hell,tr,0
30,123WILFREDO,28,ESTUDIANTE,single,male,ME GUSTA CONOCER CHICAS BONDADOSAS CARIÑOSAS Y MUY FUERTES EN LA MORAL,es,0
44,1907,48,Construction,single,male,laid back earth good sense humor,en,0
105,52Jim52,70,Retired,divorced,male,Retired owner aerospace consuloting firm,da,0
146,Aaron90,28,Social worker,single,male,Hello aaron Vermont I vermont 2 years Im really fun person I like camping bone fire etc,en,0


In [180]:
realdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17724 entries, 0 to 17723
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   gender       17724 non-null  object
 1   age          17724 non-null  object
 2   location     17724 non-null  object
 3   status       17724 non-null  object
 4   username     17723 non-null  object
 5   ethnicity    17724 non-null  object
 6   occupation   17724 non-null  object
 7   description  17724 non-null  object
 8   match_age    17724 non-null  object
 9   children     17724 non-null  object
 10  orientation  17724 non-null  object
 11  religion     17724 non-null  object
 12  smoking      17724 non-null  object
 13  drinking     17724 non-null  object
 14  intent       17724 non-null  object
 15  site         17724 non-null  object
dtypes: object(16)
memory usage: 2.2+ MB


In [181]:
realdf[realdf['description']!=''].count()

gender         17724
age            17724
location       17724
status         17724
username       17723
ethnicity      17724
occupation     17724
description    17724
match_age      17724
children       17724
orientation    17724
religion       17724
smoking        17724
drinking       17724
intent         17724
site           17724
dtype: int64

In [182]:
realdf2 = realdf[realdf["description"] == "-"].copy()

In [183]:
realdf2["description"] = realdf2["ethnicity"] +' '+ realdf2["orientation"] +' '+ realdf2["religion"] +' '+ realdf2["smoking"] +' '+ realdf2["drinking"] +' '+ realdf2["intent"]
realdf2.head(10)


Unnamed: 0,gender,age,location,status,username,ethnicity,occupation,description,match_age,children,orientation,religion,smoking,drinking,intent,site
0,female,39 y.o.,"Johannesburg, South Africa",single,00,black,Frailcare nurse,black Straight Christian non-smoker social drinker Romance,from 36 to 50,1-2 living with me,Straight,Christian,non-smoker,social drinker,Romance,realprofile\real00.json
1,female,43 y.o.,"Manizales, Caldas, Colombia",separated,0065patricia,native american,Administradora,native american Bisexual Other non-smoker never Serious Relationship,from 19 to 86,more than 2 living elsewhere,Bisexual,Other,non-smoker,never,Serious Relationship,realprofile\real0065patricia.json
2,male,63 y.o.,"Lima, Perú",divorced,007,white,comercio,white Straight Christian non-smoker occasional drinker Friendship,from 19 to 86,1-2 living elsewhere,Straight,Christian,non-smoker,occasional drinker,Friendship,realprofile\real007.json
3,male,59 y.o.,"London, UK",divorced,0077654321,white,Developer,"white Straight Atheist non-smoker occasional drinker Fun, Friendship, Romance, Serious Relationship",from 48 to 65,1-2 living elsewhere,Straight,Atheist,non-smoker,occasional drinker,"Fun, Friendship, Romance, Serious Relationship",realprofile\real0077654321.json
4,male,30 y.o.,"Sucre, Bolivia",single,00alex00,hispanic,Guardia de seguridad,"hispanic Straight Other non-smoker never Fun, Friendship, Romance",from 19 to 86,don't want children,Straight,Other,non-smoker,never,"Fun, Friendship, Romance",realprofile\real00alex00.json
5,male,32 y.o.,"Colinas de Cerro Viento, Panamá",single,00lucero90,hispanic,Ingeniero naval,"hispanic Straight Christian social smoker social drinker Fun, Friendship, Romance",from 19 to 86,no children,Straight,Christian,social smoker,social drinker,"Fun, Friendship, Romance",realprofile\real00lucero90.json
7,male,52 y.o.,"Bucaramanga, Santander, Colombia",divorced,01Anselmo212,hispanic,Ingeniero,hispanic Straight Christian non-smoker never Serious Relationship,from 30 to 40,1-2 living elsewhere,Straight,Christian,non-smoker,never,Serious Relationship,realprofile\real01Anselmo212.json
8,male,49 y.o.,"Miami, Florida, EE. UU.",divorced,0279602,hispanic,Mantenimiento,hispanic Straight Other non-smoker never Serious Relationship,from 33 to 50,1-2 living elsewhere,Straight,Other,non-smoker,never,Serious Relationship,realprofile\real0279602.json
10,male,40 y.o.,"Chicago, IL, USA",single,03Robert,hispanic,-,"hispanic Straight Other light smoker social drinker Friendship, Romance, Serious Relationship",from 28 to 41,want children,Straight,Other,light smoker,social drinker,"Friendship, Romance, Serious Relationship",realprofile\real03Robert.json
11,male,49 y.o.,"Arequipa, Perú",single,06Wil09,hispanic,Docente,"hispanic Straight Christian non-smoker never Fun, Romance, Marriage",from 21 to 40,no children,Straight,Christian,non-smoker,never,"Fun, Romance, Marriage",realprofile\real06Wil09.json


In [184]:
#combine realdf2 with realdf3 
realdf3 = realdf[realdf["description"] != '-']
realdf4 = pd.concat([realdf3, realdf2])
len(realdf4)

17724

In [185]:
realdf4.head()

Unnamed: 0,gender,age,location,status,username,ethnicity,occupation,description,match_age,children,orientation,religion,smoking,drinking,intent,site
6,male,58 y.o.,"San Salvador, El Salvador",divorced,0119162866,hispanic,Negocio,"Honesto. Trabajador, divertido. Me gusta la naturaleza, viajar, bailar.",from 30 to 42,no children,Straight,Spiritual,social smoker,never,Friendship,realprofile\real0119162866.json
9,male,38 y.o.,"Boaco, Nicaragua",single,02erick,hispanic,Decoraciones eventos privados,Soy un hombre sociable cariñoso y me gusta conoser personas y tener la oportunidad de establecer una relación en algún momento,from 24 to 39,want children,Straight,Christian,social smoker,occasional drinker,"Fun, Friendship, Romance, Serious Relationship",realprofile\real02erick.json
18,male,35 y.o.,"Bogota, Colombia",single,1047373361es,native american,Electrcista,Honesta honrrada y trabajadora y muy alegre amigable,from 30 to 50,no children,Straight,Other,non-smoker,never,"Serious Relationship, Marriage",realprofile\real1047373361es.json
22,female,40 y.o.,"Machala, Ecuador",single,12345bonita,black,-,Soy negra mido metro 150y soy gordita,from 19 to 86,more than 2 living with me,Bisexual,Christian,non-smoker,never,Serious Relationship,realprofile\real12345bonita.json
26,male,66 y.o.,"San Jose, CA, USA",single,123canwe,white,Retired,I am full of fire and friskier than hell!!!!,from 19 to 64,don't want children,Straight,Christian,non-smoker,never,"Fun, Friendship, Romance, Serious Relationship",realprofile\real123canwe.json


In [188]:
#select relevant columns
realdf = realdf5[["username","age","occupation","status","gender","description"]].copy()

#remove 'y.o' from age
def get_age(x):
    return x.split('y')[0]
     
realdf["age"] = realdf["age"].apply(lambda x: get_age(x))

#remove punctuation
import re
def get_new_sentence(x):
    return re.sub(r'[^\w\s]', '', str(x)).strip()

realdf["description"] = realdf["description"].apply(lambda x: get_new_sentence(x))

In [189]:
#removing punctuations from age, status, gender and occupation columns
realdf["gender"] = realdf["gender"].apply(lambda x: get_new_sentence(x))
realdf["age"] = realdf["age"].apply(lambda x: get_new_sentence(x))
realdf["status"] = realdf["status"].apply(lambda x: get_new_sentence(x))
realdf["occupation"] = realdf["occupation"].apply(lambda x: get_new_sentence(x))

In [193]:
# strip whitespace
realdf['occupation'].str.strip()

26               Retired
30            ESTUDIANTE
44          Construction
59       single engineer
63                      
              ...       
17718                   
17719          Profesora
17720                Spa
17722               Cook
17723                   
Name: occupation, Length: 13948, dtype: object

In [199]:
# remove empty occupation
realdf = realdf[realdf['occupation']!='']

In [202]:
realdf['occupation'].tolist()

['Retired',
 'ESTUDIANTE',
 'Construction',
 'single engineer',
 'Social Media Executive',
 'Auto engineer',
 'medical field',
 'Retired',
 'Welder Pipeline Sell Real Estate',
 'Hardworker',
 'Social worker',
 'Production Worker',
 'System Analyst',
 'retired',
 'Executive',
 'journalist',
 'Actor',
 'Community Manager',
 'Designer',
 'support engineer',
 'Retired',
 'Construction worker driver',
 'Boss',
 'MEDICO CIRUJANO ESPECIALISTA EN 3 RAMAS DE LA MEDICINA',
 'Logistic Company',
 'Chef',
 'Retired lecturer',
 'Entrepreneur',
 'Elementary teacher',
 'Hvac engineering technician',
 'Student',
 'Graphic Designer Artist',
 'Retired',
 'Medical',
 'Sales',
 'Service',
 'Author',
 'Self emplyed',
 'Engineer',
 'retired',
 'electric power grids',
 'Teacher',
 'Care worker',
 'Rental company',
 'engineer industrial',
 'Cobranza',
 'Chofer',
 'Especialista en Ultrasonido Cardiaco',
 'Lawyer',
 'Airport Security',
 'Engineer',
 'Fast food place',
 'POS Tech support',
 'Disabled',
 'Life Coa

In [200]:
realdf.isna().sum()

username             0
age                  0
occupation           0
status               0
gender               0
description          0
filtered sentence    0
dtype: int64

In [201]:
#remove stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stop_words(description):
    
    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(description)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    return ' '.join(filtered_sentence)

#create a new column to store the filtered sentences
realdf["filtered sentence"] = realdf["description"].apply(lambda x: remove_stop_words(x))

In [205]:
from langdetect import detect
realdf['Language'] = realdf['occupation'].apply(lambda x: detect(x))


In [206]:
realdf['Language'].value_counts()

es       1923
en       1634
pt        899
it        824
de        552
nl        413
af        308
ro        282
da        272
fr        236
tl        226
ca        200
id        194
cy        164
so        154
no        129
fi        125
tr         99
pl         55
et         53
hr         50
lt         48
sw         44
sv         44
sl         32
sk         19
ru         19
vi         15
hu         13
cs          8
lv          7
mk          6
sq          5
bg          4
uk          3
zh-cn       1
Name: Language, dtype: int64

In [217]:
# check no. of english rows
realdf[realdf['Language']=='en'].count()

username             798
age                  798
occupation           798
status               798
gender               798
filtered sentence    798
Language             798
y                    798
dtype: int64

In [287]:
realdf['translated_occupation'] = realdf['occupation'].apply(lambda x: translate(x))

In [289]:
realdf.sample(10)

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,Language,y,translated_occupation
3023,Catherin,33,Administrador,divorced,female,white Straight Spiritual nonsmoker social drinker Friendship Serious Relationship,pt,0,Administrator
6989,Hmo,51,Construccion,separated,male,hispanic Straight Other nonsmoker never Serious Relationship,es,0,Building
14027,Quique79,43,Fotógrafo,single,male,hispanic Straight Other nonsmoker never Fun Friendship Romance,pt,0,Photographer
15284,ScottEW,34,Student,single,male,white Straight Other social smoker social drinker Fun Friendship Romance Serious Relationship,de,0,Student
12240,Mishely,23,Ama de casa,single,female,black Straight Atheist nonsmoker never Serious Relationship,pt,0,Housewife
1037,Amando90,31,Installer,single,male,hispanic Straight Other social smoker never Fun Friendship Romance,no,0,Installer
8693,joselopez,27,estudiante universitario,single,male,hispanic Straight Christian nonsmoker never Serious Relationship,it,0,college student
15910,Steven86,35,Tatuador,separated,male,hispanic Straight Spiritual nonsmoker occasional drinker Fun Friendship Romance,pt,0,tattoo artist
8806,jostinfer93,28,emprendedor,single,male,hispanic Bisexual Spiritual nonsmoker occasional drinker Friendship Romance Serious Relationship Marriage,pt,0,entrepreneur
14581,Robert999,47,Taxista,separated,male,white Straight Other social smoker social drinker Fun Friendship Romance Serious Relationship,so,0,Cabby


In [208]:
realdf.drop(columns = "description", inplace = True)

In [209]:
realdf["y"] = 0
realdf.head()

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,Language,y
26,123canwe,66,Retired,single,male,I full fire friskier hell,tr,0
30,123WILFREDO,28,ESTUDIANTE,single,male,ME GUSTA CONOCER CHICAS BONDADOSAS CARIÑOSAS Y MUY FUERTES EN LA MORAL,es,0
44,1907,48,Construction,single,male,laid back earth good sense humor,en,0
59,1engineer,37,single engineer,single,male,I like travel watch movies read books sports different kind activities I passion life love among people people,af,0
86,2sk00pz,49,Social Media Executive,single,male,Im friendly easygoing cheerful guy loves laugh familyorientated well loyal kind Id love find forever woman,ro,0


In [211]:
# removing occupation value count ==1 
realtemp=realdf['occupation'].tolist()
realdf=realdf[realdf['occupation'].apply(lambda x: realtemp.count(x)>1)].copy()

In [290]:
# check null occupation
realdf[realdf['occupation']=='-']

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,Language,y,translated_occupation


In [291]:
# checking value_counts for status
realdf['status'].value_counts()

single             3498
divorced           1048
separated           699
widowed             179
in relationship     110
married              75
Name: status, dtype: int64

In [214]:
# checking value_counts for gender
realdf['gender'].value_counts()

male      3841
female    1768
Name: gender, dtype: int64

In [292]:
realdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5609 entries, 26 to 17722
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   username               5609 non-null   object
 1   age                    5609 non-null   object
 2   occupation             5609 non-null   object
 3   status                 5609 non-null   object
 4   gender                 5609 non-null   object
 5   filtered sentence      5609 non-null   object
 6   Language               5609 non-null   object
 7   y                      5609 non-null   int64 
 8   translated_occupation  5609 non-null   object
dtypes: int64(1), object(8)
memory usage: 438.2+ KB


In [293]:
# final 5609 rows
realdf.to_csv('real_profile.csv', header=True, index=False)

# combined dataset

In [352]:
final_df = pd.concat([realdf,scamdf1])
final_df.head()

Unnamed: 0,username,age,occupation,status,gender,filtered sentence,Language,y,translated_occupation
26,123canwe,66,Retired,single,male,I full fire friskier hell,tr,0,Retired
30,123WILFREDO,28,ESTUDIANTE,single,male,ME GUSTA CONOCER CHICAS BONDADOSAS CARIÑOSAS Y MUY FUERTES EN LA MORAL,es,0,STUDENT
44,1907,48,Construction,single,male,laid back earth good sense humor,en,0,Construction
105,52Jim52,70,Retired,divorced,male,Retired owner aerospace consuloting firm,da,0,Retired
146,Aaron90,28,Social worker,single,male,Hello aaron Vermont I vermont 2 years Im really fun person I like camping bone fire etc,en,0,Social worker


In [354]:
final_df['status'].value_counts()

single                5038
divorced              1466
widowed               1014
separated              721
in relationship        110
married                 75
widower                  8
Widowed                  8
Divorced                 6
widow                    4
single or divorced       2
Name: status, dtype: int64

In [359]:
# perform status lemmatization
import gensim
from gensim.utils import simple_preprocess
import nltk.corpus
import nltk
import gensim.corpora
from nltk.corpus import stopwords
import numpy
import time
from nltk.stem import WordNetLemmatizer

#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()


In [360]:
final_df['status']= final_df['status'].apply(lambda x: x.lower())
final_df['status'].nunique()

9

In [361]:
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = wordnet_lemmatizer.lemmatize(text)
    return lemm_text

final_df['status_lem']=final_df['status'].apply(lambda x:lemmatizer(x))

In [383]:
# limitation: lemmatization did not work as widower/widow was not lemmatized
final_df['status_lem'].value_counts()

single                5038
divorced              1472
widowed               1022
separated              721
in relationship        110
married                 75
widower                  8
widow                    4
single or divorced       2
Name: status_lem, dtype: int64

In [390]:
# manually replace widower/widow = widowed
final_df.replace({'status_lem':{'widow':'widowed'}}, inplace=True)
final_df.replace({'status_lem':{'widower':'widowed'}}, inplace=True)

In [393]:
final_df['status_lem'].value_counts()

single                5038
divorced              1472
widowed               1034
separated              721
in relationship        110
married                 75
single or divorced       2
Name: status_lem, dtype: int64

In [394]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8452 entries, 26 to 4341
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   username               8452 non-null   object
 1   age                    8452 non-null   object
 2   occupation             8452 non-null   object
 3   status                 8452 non-null   object
 4   gender                 8452 non-null   object
 5   filtered sentence      8452 non-null   object
 6   Language               8452 non-null   object
 7   y                      8452 non-null   int64 
 8   translated_occupation  8452 non-null   object
 9   status_lem             8452 non-null   object
dtypes: int64(1), object(9)
memory usage: 726.3+ KB


In [303]:
#read to csv file
final_df.to_csv('combined_profile.csv', header=True, index=False)

# 4. Distribution plots for categorical variables 

## 4.1 Age distribution of real profiles

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

counts = realdf["Age"].value_counts(normalize = True)

fig = px.bar(counts, title="Age of real profiles")
fig.update_layout(
    xaxis_title = "Age",
    yaxis_title = "Number of profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

counts = scamdf1["age"].value_counts(normalize = True)

fig = px.bar(counts, title="Age of scam profiles")
fig.update_layout(
    xaxis_title = "Age",
    yaxis_title = "Number of profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

## 4.1 Gender distribution 

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

counts = realdf["gender"].value_counts(normalize = True)

fig = px.bar(counts, title="Gender distribution of real profiles")
fig.update_layout(
    xaxis_title = "Gender",
    yaxis_title = "Percentage of profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

counts = scamdf["gender"].value_counts(normalize = True)

fig = px.bar(counts, title="Gender distribution of scam profiles")
fig.update_layout(
    xaxis_title = "Gender",
    yaxis_title = "Percentage of profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

## 4.6 Status

In [None]:
counts = realdf["status"].value_counts(normalize = True)

fig = px.bar(counts, title="Status of real profiles")
fig.update_layout(
    xaxis_title = "Status",
    yaxis_title = "Percentage of real profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

In [None]:
counts = scamdf["status"].value_counts(normalize = True)

fig = px.bar(counts, title="Status of scam profiles")
fig.update_layout(
    xaxis_title = "Status",
    yaxis_title = "Percentage of scam profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

In [None]:
counts = scamdf["occupation"].value_counts(normalize = True)

fig = px.bar(counts, title="Occupation of scam profiles")
fig.update_layout(
    xaxis_title = "Occupations",
    yaxis_title = "Percentage of scam profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

In [None]:
counts = realdf["occupation"].value_counts(normalize = True)

fig = px.bar(counts, title="Occupation of real profiles")
fig.update_layout(
    xaxis_title = "Occupations",
    yaxis_title = "Percentage of real profiles",
    title_x = 0.5,
    showlegend = False
)

fig.show()

## Get Image name of selected scam profiles

In [None]:
import json
import os
import glob
import pandas as pd
import numpy as np

data = pd.read_csv("scam_profile.csv")
data

In [None]:
json_dir = 'data/scamprofile'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

dfs = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
        json_data['site'] = file.rsplit("/", 1)[-1]
    dfs.append(json_data)
scamdf = pd.concat(dfs)

In [None]:
scamdf

In [None]:
selected = np.array(scamdf[scamdf['username'].isin(data['username'])]['images'])
pd.DataFrame(scamdf[scamdf['username'].isin(data['username'])]['images'].explode()).to_csv("selected_scam_images.csv",index=False)
# pd.read_csv("selected_scam_images.csv")

## Selected Real Profile Images

In [None]:
data1 = pd.read_csv("real_profile.csv")

pd.DataFrame(data1['username']+".jpg").to_csv("selected_real_images.csv",index=False)

In [None]:
pd.read_csv("selected_real_images.csv")