### Importing Libraries

In [1]:
import time, numpy as np, pandas as pd

import json

import psycopg2 as pg2
from psycopg2.extras import RealDictCursor, Json

### Collecting Fashion House Tweets

In [2]:
#Reading in application keys 

with open('../data/twitter_keys.txt') as api:
    keys = api.read().split('\n') 

#Aplication keys

import twitter, re, datetime, pandas as pd

twitter_keys = {
    'consumer_key':        keys[0],
    'consumer_secret':     keys[1],
    'access_token_key':    keys[2],
    'access_token_secret': keys[3]
}

api = twitter.Api(
    consumer_key         =   twitter_keys['consumer_key'],
    consumer_secret      =   twitter_keys['consumer_secret'],
    access_token_key     =   twitter_keys['access_token_key'],
    access_token_secret  =   twitter_keys['access_token_secret'],
    tweet_mode           =   'extended'
)

In [3]:
class TweetMiner():
    
    result_limit = 20
    api          = None
    tweets       = []
    
    def __init__(self, keys_dict, api, result_limit = 20):
        
        self.keys_dict     = keys_dict
        self.api           = api
        self.results_limit = result_limit
        
    def mine_user_tweets(self, user='gucci', mine_retweets = False, max_pages = None, last_tweet_id = False):
        
        #time out feedback
        print("Mining tweets for:", user)
        
        tweets        = []
        last_tweet_id = False
        page           = 1
        
        while page <= max_pages:
            
            #time out feedback
            print("Mining page:", page)
            
            if last_tweet_id:
                statuses = self.api.GetUserTimeline(
                    screen_name = user,
                    count       = self.result_limit,
                    max_id      = last_tweet_id - 1
                )
            else:
                statuses = self.api.GetUserTimeline(
                    screen_name = user,
                    count       = self.result_limit
                )

            for item in statuses:
                
                last_tweet_id = item.id
                
                tweets.append({
                    'tweet_id': item.id,
                    'handle':   item.user.name,
                    'text':     item.full_text,
                    'created':  item.created_at
                })
                
            page += 1
        
        return tweets
            
        
miner = TweetMiner(twitter_keys, api, result_limit=50)

### Fashion Labels

In [4]:
wikipedia_labels = ['akris_official', 'McQueen', 'alfreddunhill', 'AlphaIndustries', 'andres_sarda', 'Anne_Fontaine',\
                   'aquascutum', 'giorgioarmani', 'emporioarmani', 'ArmaniExchange', 'AspinalofLondon', 'AspreyLondon',\
                   'AleDellAcqua', 'Dior', 'dkny', 'dolcegabbana', '_BadgleyMischka', 'BadgleyMischka', 'BALENCIAGA',\
                   'Bally', 'Balmain', 'BensonAndClegg', 'berluti', 'Beulahlondon', 'ManoloBlahnik', 'HUGOBOSS',\
                    'BottegaVeneta', 'ThomBrowneNY', 'Bulgariofficial', 'Burberry', 'NigelCabourn', 'CalvinKlein',\
                   'camillaandmarc', 'Cartier', 'Dsquared2', 'CERRUTI1881', 'CHANEL', 'Childrensalon', 'chloefashion',\
                   'matthewbridal', 'chromeheartsusa', 'ClementsRibeiro', 'Coach', 'COMMEGARCONS', 'Common Projects',\
                   'corneliajames', 'Corneliani_com', 'CuteCircuit', 'cutlerandgross', 'DegeandSkinner', 'NUMEROVENTUNO',\
                   'dkny', 'dolcegabbana', 'ermannoscervino', 'ESCADA', 'EtroOfficial', 'Faconnable', 'Fendi',\
                    'TOMFORD', 'franksorbier', 'freywilleglobal', 'STUDIO_FULTON', 'DVF', 'JPGaultier', 'GievesLondon',\
                   'givenchy', 'Goyard', 'gucci', 'HackettLondon', 'KevanHallDesign', 'halston', 'Hardy Amies ', 'HELMUTLANG',\
                   'Hermes_Paris', 'HouseofHerrera', 'TommyHilfiger', 'SherriHill', 'GeorgesHobeika', 'marcjacobs', 'jimmychoo',\
                   'johnvarvatos', 'ChristopherKane', 'Karen_Kane', 'katespadeny', 'MaryKatrantzou', ' Khaadi', 'LACOSTE',\
                   'BNYhandbags', 'KarlLagerfeld', 'DerekLamNYC', 'LANVINofficial', 'LardiniOfficial', 'guylaroche', 'Larusmiani',\
                   'JudithLeiber', 'M_Lhuillier', '31philliplim', 'DanLiuTatsuaki', 'Longchamp', 'LouboutinWorld',\
                   'Luxottica', 'BrunoMagli', 'Margiela', 'MarchesaFashion', 'Marimekkoglobal', 'marniofficial',\
                   'maxmara', 'StellaMcCartney', 'MCMtweets', 'MichaelKors', 'Missoni', 'MIUMIUofficial', 'Moschino',\
                   'mouawadjewelry', 'MoynatParis', 'Mugler', 'MulberryEngland', 'charlottes_web', 'RICKOWENSONLINE',\
                   'PauleKa', 'philipp_plein', 'Prada', 'PringleScotland', 'EmilioPucci', 'PacoRabanne', 'Raja Fashions',\
                   'ralphandrusso', 'RalphLauren', 'OscardelaRenta', 'NinaRicci', 'Roberto_Cavalli', 'OfficialRodarte',\
                   'narcisostudio', 'RolandMouret', 'sergiorossi', 'MrRalphRucci', 'soniarykiel', 'sehkelly', 'Ferragamo',\
                   'sass_and_bide', 'SatyaPaulIndia', 'ITSJEREMYSCOTT', 'shanghaitang', 'JohnSmedley', 'PaulSmithDesign',\
                   'Smythson', 'JILLSTUART_JP', 'annasui', 'VivienneTam', 'THEROW', 'tibi', 'TiffanyAndCo', 'toryburch',\
                   'TrussardiNews', 'tumblerandtipsy', 'Turnbull_Asser', 'MaisonValentino', 'DriesVanNoten', 'ALEXVAUTHIER',\
                   'Versace', 'Viktor_and_Rolf', 'LouisVuitton', 'AlexanderWangNY', 'VeraWangGang', 'FollowWestwood',\
                   'MWWorld', 'JasonWu', 'thewunderkindco', 'YSL', 'giuseppezanotti', 'Zegna', 'ZARA']


fashion_labels = ['gucci', 'McQueen', 'StellaMcCartney', 'Fendi', 'MaisonValentino', 'Roberto_Cavalli',\
                  'LouboutinWorld', 'EtroOfficial', 'Prada', 'dolcegabbana', 'Dior', 'LouisVuitton', 'CHANEL'\
                 'Missoni', 'OscardelaRenta', 'VeraWangGang', 'HerveLeger', 'MaisonValentino', 'Balmain',\
                 'Versace', 'giorgioarmani', 'DerekLamNYC', 'AlbertaFerretti', 'YSL', 'COMMEGARCONS', 'BALENCIAGA',\
                 'Burberry', 'TOMFORD', 'ALEXVAUTHIER', 'MIUMIUofficial', 'CalvinKlein', 'givenchy', 'TommyHilfiger',\
                 'jimmychoo', 'OffWht', 'chloefashion','dkny', 'marcjacobs', 'MichaelKors', 'Goyard', 'HUGOBOSS',\
                 'kenzo', 'HELMUTLANG', 'acnestudios', 'Bulgariofficial', 'akris_official']

prime_labels = ['gucci', 'McQueen', 'StellaMcCartney', 'Roberto_Cavalli', 'acnestudios', 'jimmychoo', 'Roberto_Cavalli',\
               'TOMFORD', 'Prada', 'akris_official']

print('wikipedia_labels', len(wikipedia_labels))
print('fashion_labels:', len(fashion_labels))
print('prime_labels:', len(prime_labels))

wikipedia_labels 172
fashion_labels: 45
prime_labels: 10


### Functions 

In [5]:
#function to connect to posgres db on ubuntu instance 

def con_cur_to_db():
    con = pg2.connect(host = '13.52.0.109', 
                     dbname = 'postgres',
                     user = 'postgres',
                     password = 'zoolander')
    cur = con.cursor(cursor_factory = RealDictCursor)
    return con, cur

In [6]:
#function to insert data into unstructured postgres db

def insert_json(data):
    con, cur = con_cur_to_db()
    for x in data:
        cur.execute('INSERT INTO all_data (data) VALUES ({});'.format(Json(x)))
    con.commit()
    con.close()

In [7]:
#grabs data from postgres db

def fetch_data(query):
    con, cur = con_cur_to_db()
    cur.execute(query)
    data = cur.fetchall()
    con.close()
    return data

In [8]:
#function to run sql queries in postgres db 

def get_handle_tweets(handle):
    query = f"""
    SELECT * 
    FROM all_data
    Where data->>'handle' = 
    '{handle}'; """
    data = fetch_data(query)
    return data

In [9]:
#transforms list of dicts of dicts to list of dicts 

def extractor(query_result):
    return [*map(lambda x: x['data'], query_result)]

### Extracting tweets & storing to postgres db

In [38]:

statuses = api.GetUserTimeline(
                    screen_name = 'akris_official',
                    count       = 100)

In [39]:
statuses


[Status(ID=971416620429127680, ScreenName=akris_official, Created=Wed Mar 07 16:05:45 +0000 2018, Text='Vienna 1900-1918. It was a groundbreaking time for women. Artists like Klimt &amp; Schiele portrayed the profound change in society, created reform dresses liberating women from the corset. Albert Kriemler translated this spirit into his collection. https://t.co/7kZ87mK6op https://t.co/7klnaOdfJO'),
 Status(ID=970652605578870784, ScreenName=akris_official, Created=Mon Mar 05 13:29:49 +0000 2018, Text='A closer look into our #AkrisFW18 collection, which features Marble inspired prints and… https://t.co/iFT9Y3do37'),
 Status(ID=970347792932761600, ScreenName=akris_official, Created=Sun Mar 04 17:18:36 +0000 2018, Text='One of our powerful looks from the #AkrisFW18 collection. The green Malachite Print dress - bold… https://t.co/ymSk7Xr3mz'),
 Status(ID=969942904851566592, ScreenName=akris_official, Created=Sat Mar 03 14:29:43 +0000 2018, Text="Bonjour Paris! \nArt and architecture play

In [29]:
#function to insert data into unstructured postgres db

def insert_to_fashion(data):
    con, cur = con_cur_to_db()
    for x in data:
        cur.execute('INSERT INTO fashion(data) VALUES ({});'.format(Json(x)))
    con.commit(
        try:
        # database transaction code
        except (sqlalchemy.exc.SQLAlchemyError, sqlalchemy.exc.DBAPIError) as e:
        # handle exception "e", or re-raise appropriately.
)
    con.close()

In [None]:
insert_to_fashion()

In [41]:
# scraper automates api call

def scrape_label(label):
    try:
        tweet_data = miner.mine_user_tweets(user=label, max_pages=200, last_tweet_id=True)
        insert_to_fashion(tweet_data)
    except:
        print('Timed out on:', label)
        with open('../logfile.txt', 'a+') as f:
            f.write(f'Timed out on {label} at {time.ctime()}\n')
        time.sleep(915)
        scrape_label(label)
        
for label in wikipedia_labels:
    scrape_label(label) 

Timed out on: akris_official


KeyboardInterrupt: 

### Extracting data from postgres db

In [None]:
#function to run sql queries in postgres db 

def get_fashion_tweets(handle):
    query = f"""
    SELECT * 
    FROM fashion
    Where data->>'handle' = 
    '{handle}'; """
    data = fetch_data(query)
    return data

In [19]:
# Extracting data from db and creating a data frame

labels = ['Acne Studios', 'Akris', 'Alexander McQueen', 'gucci', 'Jimmy Choo', 'PRADA', 'Roberto Cavalli',\
'Stella McCartney', 'TOM FORD'] 

data = []

for label in labels:
    data.extend(get_handle_tweets(label))
    
extracted_data = extractor(data)

fashion = pd.DataFrame(extracted_data)
fashion.shape

(17518, 4)

In [20]:
fashion.handle.value_counts()

gucci                3240
Stella McCartney     2000
Roberto Cavalli      2000
Acne Studios         2000
Alexander McQueen    2000
Jimmy Choo           1999
PRADA                1991
TOM FORD             1290
Akris                 998
Name: handle, dtype: int64

In [None]:
#################################### dave function 

items = range(1, 500)

# write a function that will capture items in chunks of 10 from 5000, decending
bucket = []

def grab_10_items():
    
    # what is the lowest number in bucket so far?
    #start from there
    
    min_bucket = min(bucket)
    
    if items[min_bucket:-10] in bcuket:
        bucket.append(the stuff you find..)

grab_10_items()
