In [1]:
#Testing Instagram Crawler
#Medium Blog useful
#https://medium.com/@adamaulia/crawling-instagram-using-instalooter-2791edb453ff
#Documentation
#https://instalooter.readthedocs.io/en/latest/instalooter/index.html

#from instalooter.looters import ProfileLooter
#looter = ProfileLooter('eilex_kyp')
from instalooter.looters import HashtagLooter
import os
import json
from urllib.request import urlretrieve
from datetime import datetime
import regex as re
import sqlalchemy
import pandas as pd
from textblob import TextBlob
from instaloader import Instaloader
from instaloader import Hashtag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import config
import helper_functions


  readline_hook.enable(use_pyreadline=use_pyreadline)


In [2]:
def r(insta):
    fashion_att = helper_functions.get_fashion_attributes()

    instaDF = pd.DataFrame(insta)
    # Preprocess metadata
    instaDF['processed_metadata'] = instaDF['description'].apply(helper_functions.preprocess_metadata)
    # Preprocess query
    instaDF['query'] = instaDF['query'].apply(lambda row: ' '.join(helper_functions.preprocess_words(row.split())))

    ## Calculate a factor for tokens that appear in metatdata
    keywords = instaDF[0]['query']
    instaDF['factor'] = instaDF['processed_metadata'].apply(
        lambda row: len(set([word for word in row if word in keywords])) / len(keywords))

    ## Calculate a factor based on the cosine similarity of TFIDF transformation of the query terms and
    # the processed metadata using the fashion expert terminology as vocabulary
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1))
    vectorizer.fit_transform(fashion_att)
    tfidf_vector = vectorizer.transform(instaDF['processed_metadata'])
    query_vector = vectorizer.transform(instaDF['query'])

    ## Calculate cosine similarity
    cosine_vector = cosine_similarity(query_vector[0], tfidf_vector)
    instaDF['cosine ranking score'] = np.hstack(cosine_vector).tolist() * instaDF['factor']

    ## Calculate a factor based on Pinterest's recommendation (order of result parsing)
    scaler = MinMaxScaler()
    pinterest_score = scaler.fit_transform(np.arange(len(instaDF)).reshape(-1, 1))
    instaDF.loc[instaDF.sort_values(by='timestamp', ascending=False).index, 'pinterest score'] = pinterest_score

    ## Calculate Final Ranking Score giving the cosine similarity factor a greater score than the
    # factor based on the Pinterest recommendation
    instaDF['final score'] = (instaDF['cosine ranking score'] * 0.7) + (instaDF['pinterest score'] * 0.3)
    instaDF.sort_values(by='final score', ascending=False, inplace=True)

    # Save ranked results to the database
    for _, row in instaDF.iterrows():
        site = 'Instagram'
        searchwords = ''.join(keywords.split())
        imageFilePath = row['imageFilePath']
        url = row['URL']
        imgURL = row['imgURL']
        empPhoto = helper_functions.getImage(imgURL, imageFilePath)
        head = row['title']
        meta = row['description']
        helper_functions.addNewProduct(site,
                                       searchwords,
                                       imageFilePath,
                                       empPhoto,
                                       url,
                                       imgURL,
                                       head,
                                       None,
                                       None,
                                       None,
                                       meta,
                                       None,
                                       None)



In [3]:
if __name__ == '__main__':
    ########################################### SEARCH PATH KEYWORDS ###########################################
    currendDir = helper_functions.WEB_CRAWLERS
    engine = helper_functions.ENGINE
    dbName = helper_functions.DB_NAME

    ########################################### Open the file with read only permit ###########################################
    file = open(os.path.join(currendDir, 'keywords.txt'), "r")

    ########################################### Use readlines to read all lines in the file ###########################################
    lines = file.readlines()  # The variable "lines" is a list containing all lines in the file
    file.close()  # Close the file after reading the lines.


    ########################################### SCRAPE IMAGES FOR EVERY ENTRY IN KEYWORDS ###########################################
    for i in range(0, len(lines)):
        keys = lines[i]
        keys = keys.replace('\n', '')
        print("Crawler Search no." + str(i + 1) + ' ------------------- Search query: "' + str(keys) + '"')  #

        keywords = keys.split(" ")
        keyLen = len(keywords)
        keyUrl = keywords[1].strip('"')
        breakNumber = int(keywords[0])
        for j in range(2, keyLen):
            keyUrl = keyUrl + ' ' + keywords[j].strip('"')

        print('Query: ' + str(keyUrl))
        print("Number of crawled images wanted: " + str(breakNumber))

        ########################################### Scraper / Hashtag ###########################################
        search = keyUrl.replace(' ','')
        hashtagtext = search.replace('-','')

        threshold = breakNumber
        # productsDF = pd.read_sql_query('''SELECT * FROM S4F.dbo.Product''', engine)
        productsDF = pd.read_sql_query('''SELECT * FROM  public.\"Product\"''', engine)
        L = Instaloader()
        L.login(config.INSTAGRAM_USERNAME, config.INSTAGRAM_PASSWORD)
        hashtag = Hashtag.from_name(L.context, hashtagtext)
        count = 0
        insta = []

        for post in hashtag.get_posts():
            count = count + 1
            post_url = "https://www.instagram.com/p/" + str(post.shortcode) + "/"
            imgsource = post.url
            testdf = productsDF.loc[productsDF['URL'] == post_url]
            video = post.is_video
            # print(helper_functions.setImageFilePath(post_url, hashtagtext,count))
            imageFilePath = helper_functions.setImageFilePath(post_url, hashtagtext,count)
            if testdf.empty and not video:
                post_info = " ".join(re.findall("[a-zA-Z]+", post.caption))
                post_hashtags = post.caption_hashtags
                post_likes = post.likes
                post_date = post.date
                insta.append(({'query': keyUrl,
                               'timestamp': post_date,
                               'url': post_url,
                               'imgURL': imgsource,
                               'imageFilePath':imageFilePath,
                               'title': None,
                               'description': post_info}))
            if count > threshold:
                break
                
        fashion_att = helper_functions.get_fashion_attributes()


Crawler Search no.1 ------------------- Search query: "10 black shirts"
Query: black shirts
Number of crawled images wanted: 10


In [4]:
instaDF = pd.DataFrame(insta)
instaDF

Unnamed: 0,query,timestamp,url,imgURL,imageFilePath,title,description
0,black shirts,2020-12-17 07:53:08,https://www.instagram.com/p/CI5CRDSp5aD/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Black and Blue Blackshirts ties Dapper beard w...
1,black shirts,2020-12-17 06:05:29,https://www.instagram.com/p/CI418jYBWDp/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Lovely cotton linen tops Available in black pi...
2,black shirts,2020-12-17 04:50:08,https://www.instagram.com/p/CI4tUu3hiDV/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Ative T Shirt Black Version Coronavirus Christ...
3,black shirts,2020-12-17 02:43:19,https://www.instagram.com/p/CI4ez25pWT_/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,blackshirts blackshort photography
4,black shirts,2020-12-17 01:39:03,https://www.instagram.com/p/CI4XdM1hrkX/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Only printshop in town that can print on dark ...
5,black shirts,2020-12-16 23:07:03,https://www.instagram.com/p/CI4GD6qFA4B/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,shefashionapparel shefashionapparelg multibran...
6,black shirts,2020-12-16 22:05:06,https://www.instagram.com/p/CI3--HKnkKq/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Shop link in bio blackshirt blackshirts croche...
7,black shirts,2020-12-16 21:40:09,https://www.instagram.com/p/CI38HaSpkQB/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,No caption needed customtshirts hoodies blacke...
8,black shirts,2020-12-16 21:00:52,https://www.instagram.com/p/CI33npiggpX/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,BlackPill collection lien dans la bio Alexein ...


In [5]:
instaDF = pd.DataFrame(insta)
# Preprocess metadata
instaDF['processed_metadata'] = instaDF['description'].apply(lambda x: helper_functions.preprocess_metadata(x, True))
# Preprocess query
instaDF['query'] = instaDF['query'].apply(helper_functions.preprocess_metadata)
## Calculate a factor for query tokens that appear in metadata
keywords = instaDF.iloc[0]['query'].split()
instaDF['factor'] = instaDF['processed_metadata'].apply(lambda row: len(set([word for word in row.split() if word in keywords])) / len(keywords))
instaDF['factor']

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
5    1.0
6    1.0
7    1.0
8    1.0
Name: factor, dtype: float64

In [24]:
## Calculate a factor based on the cosine similarity of TFIDF transformation of the query terms and 
# the processed metadata using the fashion expert terminology as vocabulary
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1))
vectorizer.fit_transform(fashion_att)
tfidf_vector = vectorizer.transform(instaDF['processed_metadata'])
query_vector = vectorizer.transform(instaDF['query'])

## Calculate cosine similarity
cosine_vector = cosine_similarity(query_vector[0].toarray(), tfidf_vector.toarray())
instaDF['cosine ranking score'] = np.hstack(cosine_vector).tolist() * instaDF['factor']



In [63]:
pd.DataFrame(insta)


Unnamed: 0,query,timestamp,url,imgURL,imageFilePath,title,description
0,black shirts,2020-12-17 07:53:08,https://www.instagram.com/p/CI5CRDSp5aD/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Black and Blue Blackshirts ties Dapper beard w...
1,black shirts,2020-12-17 06:05:29,https://www.instagram.com/p/CI418jYBWDp/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Lovely cotton linen tops Available in black pi...
2,black shirts,2020-12-17 04:50:08,https://www.instagram.com/p/CI4tUu3hiDV/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Ative T Shirt Black Version Coronavirus Christ...
3,black shirts,2020-12-17 02:43:19,https://www.instagram.com/p/CI4ez25pWT_/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,blackshirts blackshort photography
4,black shirts,2020-12-17 01:39:03,https://www.instagram.com/p/CI4XdM1hrkX/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Only printshop in town that can print on dark ...
5,black shirts,2020-12-16 23:07:03,https://www.instagram.com/p/CI4GD6qFA4B/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,shefashionapparel shefashionapparelg multibran...
6,black shirts,2020-12-16 22:05:06,https://www.instagram.com/p/CI3--HKnkKq/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Shop link in bio blackshirt blackshirts croche...
7,black shirts,2020-12-16 21:40:09,https://www.instagram.com/p/CI38HaSpkQB/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,No caption needed customtshirts hoodies blacke...
8,black shirts,2020-12-16 21:00:52,https://www.instagram.com/p/CI33npiggpX/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,BlackPill collection lien dans la bio Alexein ...


In [61]:
instaDF = pd.DataFrame(insta)
instaDF['processed_metadata'] = instaDF['description'].apply(lambda x: helper_functions.preprocess_metadata(x, True))
# Preprocess query
instaDF['query'] = instaDF['query'].apply(lambda row: ' '.join(helper_functions.preprocess_words(row.split())))
keywords = instaDF.iloc[0]['query'].split()
instaDF['factor'] = instaDF['processed_metadata'].apply(lambda row: len(set([word for word in row.split() if word in keywords])) / len(keywords))

## Calculate a factor based on the cosine similarity of TFIDF transformation of the query terms and
# the processed metadata using the fashion expert terminology as vocabulary
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1))
vectorizer.fit_transform(fashion_att)
tfidf_vector = vectorizer.transform(instaDF['processed_metadata'])
query_vector = vectorizer.transform(instaDF['query'])

## Calculate cosine similarity
cosine_vector = cosine_similarity(query_vector[0].toarray(), tfidf_vector.toarray())
instaDF['cosine ranking score'] = np.hstack(cosine_vector).tolist() * instaDF['factor']

## Calculate a factor based on Pinterest's recommendation (order of result parsing)
scaler = MinMaxScaler((0.1, 1))
instagram_score = scaler.fit_transform(np.arange(len(instaDF)).reshape(-1, 1))
instaDF.loc[instaDF.sort_values(by='timestamp', ascending=True).index, 'instagram score'] = instagram_score

## Calculate Final Ranking Score giving the cosine similarity factor a greater score than the 
# factor based on the Pinterest recommendation
instaDF['final score'] = (instaDF['cosine ranking score'] * 0.7) + (instaDF['instagram score'] * 0.3)
instaDF.sort_values(by ='final score', ascending=False, inplace=True)

In [37]:
instaDF['cosine ranking score']

0    0.733301
1    0.619856
2    0.793914
3    0.896446
4    0.466352
5    0.659564
6    0.578518
7    0.941552
8    0.896053
Name: cosine ranking score, dtype: float64

In [60]:
instaDF

Unnamed: 0,query,timestamp,url,imgURL,imageFilePath,title,description,processed_metadata,factor,cosine ranking score,instagram score,final score
3,black shirt,2020-12-17 02:43:19,https://www.instagram.com/p/CI4ez25pWT_/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,blackshirts blackshort photography,black shirt black short photography,1.0,0.896446,0.6625,0.826262
0,black shirt,2020-12-17 07:53:08,https://www.instagram.com/p/CI5CRDSp5aD/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Black and Blue Blackshirts ties Dapper beard w...,black blue black shirt tie dapper beard watch ...,1.0,0.733301,1.0,0.813311
2,black shirt,2020-12-17 04:50:08,https://www.instagram.com/p/CI4tUu3hiDV/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Ative T Shirt Black Version Coronavirus Christ...,ative shirt black version coronavirus christma...,1.0,0.793914,0.775,0.78824
7,black shirt,2020-12-16 21:40:09,https://www.instagram.com/p/CI38HaSpkQB/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,No caption needed customtshirts hoodies blacke...,caption need custom shirt hoodies black empowe...,1.0,0.941552,0.2125,0.722836
1,black shirt,2020-12-17 06:05:29,https://www.instagram.com/p/CI418jYBWDp/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Lovely cotton linen tops Available in black pi...,lovely cotton linen top available black pink w...,1.0,0.619856,0.8875,0.700149
8,black shirt,2020-12-16 21:00:52,https://www.instagram.com/p/CI33npiggpX/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,BlackPill collection lien dans la bio Alexein ...,black pill collection lien bio alexei shirt no...,1.0,0.896053,0.1,0.657237
5,black shirt,2020-12-16 23:07:03,https://www.instagram.com/p/CI4GD6qFA4B/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,shefashionapparel shefashionapparelg multibran...,fashion apparel fashion apparel multibrand bou...,1.0,0.659564,0.4375,0.592945
6,black shirt,2020-12-16 22:05:06,https://www.instagram.com/p/CI3--HKnkKq/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Shop link in bio blackshirt blackshirts croche...,shop link bio black shirt black shirt crochet ...,1.0,0.578518,0.325,0.502463
4,black shirt,2020-12-17 01:39:03,https://www.instagram.com/p/CI4XdM1hrkX/,https://instagram.fskg3-1.fna.fbcdn.net/v/t51....,D:\Documents(D)\Projects\GitHub-repos\ISSEL\Sc...,,Only printshop in town that can print on dark ...,print shop town print dark single shirt fracti...,1.0,0.466352,0.55,0.491446


In [53]:
np.arange(1, len(instaDF)+1)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [27]:
fashion_att

['babyboy',
 'babygirl',
 'boy',
 'child',
 'childrenswear',
 'female',
 'girl',
 'kidswear',
 'male',
 'man',
 'menswear',
 'woman',
 'womenwear',
 'accessory',
 'activewear',
 'dress',
 'occasionwear',
 'coatsandjackets',
 'overall',
 'pyjama',
 'short',
 'skirt',
 'sportswear',
 'suit',
 'swimwear',
 'top',
 'trouser',
 'underwear',
 'bespokesuit',
 'mensuit',
 'tuxedo',
 'blazer',
 'bomber',
 'cape',
 'cardigan',
 'coat',
 'denim',
 'fleece',
 'jacket',
 'kimono',
 'leatherjacket',
 'poncho',
 'puffer',
 'pufferjacket',
 'raincoat',
 'trenchcoat',
 'waistcoat',
 'ballgown',
 'bardotdress',
 'blackmididress',
 'bridaldress',
 'caftan',
 'cocktaildress',
 'eveninggown',
 'eventdress',
 'frilldress',
 'halterdress',
 'holidaydress',
 'lacedress',
 'minidress',
 'nightdress',
 'promdress',
 'sheerdress',
 'shirtdress',
 'skaterdress',
 'slipdress',
 'slitdress',
 'straplessdress',
 'summerdress',
 'weddingdress',
 'tank',
 'blouse',
 'bodice',
 'camisole',
 'hooded',
 'hoodie',
 'jumpe

In [25]:
cosine_vector

array([[0., 0., 0., 0.]])

In [26]:
cosine_similarity(query_vector.toarray(), tfidf_vector.toarray())

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [8]:
instaDF['query']

0    blackshirt
1    blackshirt
2    blackshirt
3    blackshirt
Name: query, dtype: object

In [None]:

instaDF = pd.DataFrame(insta)
# Preprocess metadata
instaDF['processed_metadata'] = instaDF['description'].apply(lambda x: helper_functions.preprocess_metadata(x, True))
# Preprocess query
instaDF['query'] = instaDF['query'].apply(lambda row: ' '.join(helper_functions.preprocess_words(row.split())))

## Calculate a factor for tokens that appear in metatdata
keywords = instaDF[0]['query']
instaDF['factor'] = instaDF['processed_metadata'].apply(
    lambda row: len(set([word for word in row if word in keywords])) / len(keywords))

## Calculate a factor based on the cosine similarity of TFIDF transformation of the query terms and
# the processed metadata using the fashion expert terminology as vocabulary
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1))
vectorizer.fit_transform(fashion_att)
tfidf_vector = vectorizer.transform(instaDF['processed_metadata'])
query_vector = vectorizer.transform(instaDF['query'])

## Calculate cosine similarity
cosine_vector = cosine_similarity(query_vector[0], tfidf_vector)
instaDF['cosine ranking score'] = np.hstack(cosine_vector).tolist() * instaDF['factor']

## Calculate a factor based on Pinterest's recommendation (order of result parsing)
scaler = MinMaxScaler()
pinterest_score = scaler.fit_transform(np.arange(len(instaDF)).reshape(-1, 1))
instaDF.loc[instaDF.sort_values(by='timestamp', ascending=False).index, 'pinterest score'] = pinterest_score

## Calculate Final Ranking Score giving the cosine similarity factor a greater score than the
# factor based on the Pinterest recommendation
instaDF['final score'] = (instaDF['cosine ranking score'] * 0.7) + (instaDF['pinterest score'] * 0.3)
instaDF.sort_values(by='final score', ascending=False, inplace=True)

# Save ranked results to the database
for _, row in instaDF.iterrows():
    site = 'Instagram'
    searchwords = ''.join(keywords.split())
    imageFilePath = row['imageFilePath']
    url = row['URL']
    imgURL = row['imgURL']
    empPhoto = helper_functions.getImage(imgURL, imageFilePath)
    head = row['title']
    meta = row['description']
    helper_functions.addNewProduct(site,
                                   searchwords,
                                   imageFilePath,
                                   empPhoto,
                                   url,
                                   imgURL,
                                   head,
                                   None,
                                   None,
                                   None,
                                   meta,
                                   None,
                                   None)


In [73]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
import string
import wordsegment
wordsegment.load()

# Define stop words
STOP_WORDS = set(nltk.corpus.stopwords.words('english'))
STOP_WORDS = STOP_WORDS.union(set(nltk.corpus.stopwords.words('italian')),
                              set(nltk.corpus.stopwords.words('german')),
                              set(nltk.corpus.stopwords.words('french')),
                              set(nltk.corpus.stopwords.words('spanish')))
# Removed 'man', included in german stop_words as it is an English word
STOP_WORDS.remove('man')
# Add 'via' in stop_words
STOP_WORDS.add('via')

def lemmatize(token, pos_tag):
    lemmatizer = WordNetLemmatizer()
    tag = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}.get(pos_tag[0], wn.NOUN)
    return lemmatizer.lemmatize(token, tag)

def preprocess_metadata(doc, segmentation=False):
    # Convert to lowercase
    doc = doc.lower()
    # Remove URLs
    doc = re.sub(r'(www\S+)*(.\S+\.com)', '', doc)
    # Word segmentation, used for compound words, hashtags and spelling errors
    if segmentation:
        doc = ' '.join(wordsegment.segment(doc))
    # Remove punctuation
    doc = re.sub('[' + re.escape(string.punctuation) + ']+', ' ', doc)
    # Remove two letter words
    doc = ' '.join([word for word in doc.split() if len(word)>2])
    # Remove numbers and words with number
    doc = re.sub(r'([a-z]*[0-9]+[a-z]*)', '', doc)
    # Remove non-ASCII characters 
    doc = str(doc).encode("ascii", errors="ignore").decode()
    # Remove excess whitespace
    doc = re.sub(r'\s+', ' ', doc)
    # Remove stop words
    doc = ' '.join([word for word in doc.split() if word not in STOP_WORDS])
    
    # Tokenize
    tokenizer = TweetTokenizer(reduce_len=True)
    tokens = tokenizer.tokenize(doc)
    # Lemmatize
    tokens = [lemmatize(word, tag) for word,tag in pos_tag(tokens)]
    # Merge together
    return ' '.join(tokens)