In [1]:
import pandas as pd
import numpy as np
import requests
import re
import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer

In [10]:
# Empty result table; getUserProfile appends one row per fetched profile.
df = pd.DataFrame(
    columns=[
        'username',
        'followers',
        'following',
        'gender',
        'dob',
        'email',
        'phone',
        'nationality',
    ]
)

In [11]:
def contains_number(s):
    """Report whether ``s`` holds at least one digit character.

    Returns True as soon as a character for which ``str.isdigit`` is true is
    seen, and False when the string is exhausted (including the empty string).
    """
    for ch in s:
        if ch.isdigit():
            return True
    return False

def check_phone(data):
    """Return the first standalone 10-digit number found in ``data``.

    ``data`` is an iterable of strings scanned in order. The match must be
    exactly ten digits delimited by word boundaries; an empty string is
    returned when no element contains one.
    """
    ten_digits = re.compile(r'\b\d{10}\b')
    hits = (ten_digits.search(line) for line in data)
    # Lazily take the first successful match so scanning stops there.
    return next((hit.group() for hit in hits if hit), '')

def check_gender(data):
    """Return the first gender keyword found in ``data``, or ``''``.

    Patterns are tried in priority order (he, him, she, her, Male, Female,
    LGBTQ, LGBTQAI+); for each pattern every string in ``data`` is searched
    case-insensitively, so an earlier pattern wins even if a later pattern
    would match an earlier line. The returned value is the matched text
    exactly as it appears in the input.
    """
    gender_filters = [
        r'\bhe\b',            # he
        r'\bhim\b',           # him
        r'\bshe\b',           # she
        r'\bher\b',           # her
        r'\bMale\b',          # Male
        r'\bFemale\b',        # Female
        r'\bLGBTQ\b',         # LGBTQ
        # No trailing \b here: "+" is a non-word character, so "\+\b" would
        # only match when a letter/digit follows the plus sign and would miss
        # "LGBTQAI+" at the end of a line or before a space.
        r'\bLGBTQAI\+',       # LGBTQAI+
    ]

    for regex in gender_filters:
        for s in data:
            check = re.search(regex, s, re.IGNORECASE)
            if check:
                return check.group()

    return ''

def check_dob(data):
    """Return the first date-like substring found in ``data``, or ``''``.

    Patterns are tried in priority order; for each pattern every string in
    ``data`` is searched, so an earlier pattern wins even when a later pattern
    would match an earlier line. The patterns only recognise the *shape* of a
    date: they cannot tell dd/mm from mm/dd (or yyyy/mm/dd from yyyy/dd/mm),
    and no calendar validation is performed.
    """
    # Each shape is listed once. Repeating an identical regex further down the
    # list can never produce a match the first occurrence did not already find.
    regex_patterns = [
        r'\b\d{2}/\d{2}/\d{4}\b',         # dd/mm/yyyy or mm/dd/yyyy
        r'\b\d{2}/\d{2}/\d{2}\b',         # dd/mm/yy
        r'\b\d{1,2} [A-Za-z]+ \d{4}\b',   # 3 October 2002 / 03 October 2002
        r'\b\d{4}/\d{2}/\d{2}\b',         # yyyy/mm/dd or yyyy/dd/mm
        r'\b[A-Za-z]+ \d{1,2}, \d{4}\b',  # October 3, 2022
        r'\b\d{1,2}/\d{2}/\d{2}\b',       # d/mm/yy
        r'\b\d{1,2}/\d{2}/\d{4}\b',       # d/mm/yyyy
    ]

    for regex in regex_patterns:
        for s in data:
            check = re.search(regex, s)
            if check:
                return check.group()

    return ''

def check_email(data):
    """Return the first e-mail address found in ``data``, or ``''``.

    ``data`` is an iterable of strings scanned in order; scanning stops at the
    first string that contains something shaped like ``local@domain.tld``.
    """
    email_re = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    for line in data:
        found = email_re.search(line)
        if found is not None:
            return found.group()
    return ''
    
def check_country(non_numeric):
    """Return ``'Indian'`` if any string hints at India, otherwise ``''``.

    Two passes are made over ``non_numeric`` (an iterable of strings):

    1. country keywords / the Indian flag emoji, case-insensitively;
    2. whole-word names of Indian states and union territories.
    """
    # The long keywords are matched as substrings so hashtags such as
    # "incredibleindia" still count ("india" also covers "indian"). The bare
    # abbreviation "ind" must be a whole word: as a substring it fires on
    # ordinary English such as "kind", "mind", "find" or "industry".
    regex = r"india|hindustan|bharat|hindi|🇮🇳|\bind\b"
    for s in non_numeric:
        check = re.search(regex, s, re.IGNORECASE)
        if check:
            return 'Indian'

    indian_states = [
        "Andhra Pradesh", "Arunachal Pradesh", "Assam", "Bihar", "Chhattisgarh",
        "Goa", "Gujarat", "Haryana", "Himachal Pradesh", "Jharkhand", "Karnataka",
        "Kerala", "Madhya Pradesh", "Maharashtra", "Manipur", "Meghalaya", "Mizoram",
        "Nagaland", "Odisha", "Punjab", "Rajasthan", "Sikkim", "Tamil Nadu", "Telangana",
        "Tripura", "Uttar Pradesh", "Uttarakhand", "West Bengal",
        "Andaman and Nicobar Islands", "Chandigarh", "Dadra and Nagar Haveli",
        "Daman and Diu", "Lakshadweep", "Delhi", "Puducherry"
    ]

    # Escape each name so the list stays safe if an entry with regex
    # metacharacters is ever added.
    regex2 = r'\b(?:' + '|'.join(re.escape(state) for state in indian_states) + r')\b'
    for s in non_numeric:
        check = re.search(regex2, s, re.IGNORECASE)
        if check:
            return 'Indian'

    return ''

def nlp_processing(non_numeric):
    """Normalise each string in ``non_numeric`` in place and return the list.

    For every element: non-letters are replaced by spaces, the text is
    lower-cased and split into words, English stop words are dropped, the
    remaining words are lemmatized with WordNet, and the words are re-joined
    with single spaces.

    The input list itself is mutated (element by element) and the same list
    object is returned. An empty list is returned untouched without loading
    NLTK. The NLTK ``stopwords`` and ``wordnet`` corpora must be available;
    NLTK raises ``LookupError`` otherwise.
    """
    if not non_numeric:
        return non_numeric

    # Imported locally because the notebook's top-level nltk imports are
    # commented out; without these names the function fails with NameError.
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    # Build these once: the stop-word set and the lemmatizer do not depend on
    # the line or word being processed.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    for i, text in enumerate(non_numeric):
        words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        non_numeric[i] = ' '.join(
            lemmatizer.lemmatize(word) for word in words if word not in stop_words
        )

    return non_numeric

def extractData(biography):
    """Extract ``(gender, dob, email, phone, country)`` from a biography.

    The biography is split into lines. Date of birth and e-mail are searched
    in all lines; the phone number only in lines that contain a digit; country
    and gender only in lines without digits. Each field is ``''`` when nothing
    is found.
    """
    # Treat a missing biography (None) like an empty one instead of failing
    # on ``.split``.
    keywords = (biography or '').split('\n')
    dob = check_dob(keywords)
    email = check_email(keywords)

    numeric, non_numeric = [], []
    for txt in keywords:
        if contains_number(txt):
            numeric.append(txt)
        else:
            non_numeric.append(txt)

    phone = check_phone(numeric)
    country = check_country(non_numeric)

    # Match gender on the raw lines first. nlp_processing removes English stop
    # words (which include he/him/she/her) and every non-letter (so the "+" of
    # "LGBTQAI+"), which makes most gender patterns unmatchable afterwards.
    gender = check_gender(non_numeric)
    if not gender:
        # Fall back to the normalised text, where lemmatization and separator
        # stripping can expose a keyword (e.g. "male_model" -> "male model").
        # A copy is passed because nlp_processing rewrites its argument.
        gender = check_gender(nlp_processing(list(non_numeric)))

    return gender, dob, email, phone, country

def getUserProfile(username, timeout=10):
    """Fetch one profile from the local API and append it to the global ``df``.

    Parameters
    ----------
    username : str
        Account name sent as ``{"username": ...}`` in the POST body.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10).

    Returns
    -------
    int or str
        The new row count of ``df`` on success, or the string
        ``'Username does not exist'`` when anything goes wrong (request
        failure, HTTP error status, invalid JSON, missing field, or a parsing
        error). The real cause is logged as a warning.
    """
    import logging

    url = "http://127.0.0.1:8000/instagram/profile/"
    body = {
        "username": username
    }
    try:
        # Without a timeout a stalled server would hang the whole loop.
        r = requests.post(url, json=body, timeout=timeout)
        r.raise_for_status()
        data = r.json()
        gender, dob, email, phone, country = extractData(data['biography'])
        df.loc[len(df)] = [username, data['followers'], data['following'], gender, dob, email, phone, country]
        return len(df)
    except Exception as exc:
        # Stay best-effort so one bad profile does not abort a batch run, but
        # (unlike a bare ``except:``) let KeyboardInterrupt through and record
        # what actually failed rather than silently blaming the username.
        logging.getLogger(__name__).warning(
            "getUserProfile(%r) failed: %r", username, exc
        )
        return 'Username does not exist'

In [12]:
# Load the JSON records and keep only the first row seen for each username
# (the original index labels are kept, so gaps mark the dropped duplicates).
jsonData = pd.read_json('../data/profileData.json').drop_duplicates(
    subset="username", keep="first"
)
jsonData.head()

Unnamed: 0,likes,comment,replies,username
0,0,Worst company to@deal with is Bajajallianz,1,manojmonga2000
1,0,Check what people has to say about them here -...,0,shubhadeeproychowdhury
3,0,I want to register.,0,chetantheshowstopper
4,0,Think before investing your going to be be sca...,0,syed_latif234
5,0,Amazing👍,0,ankita_vasoya2


In [13]:
# Look up every unique username; each successful call appends one row to `df`.
usernames = jsonData['username']
for username in usernames:
    getUserProfile(username)

# Preview the first rows of the populated table.
df.head()

Unnamed: 0,username,followers,following,gender,dob,email,phone,nationality
0,panchalsandip5,851,4900,,,,,
