# Import Packages and Data

In [11]:
# basics
import pandas as pd 
import numpy as np

# files
import glob
import pickle
import os
import requests
import sys

# text
import re
import string

# nlp
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/labbot/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
# all the data scraped using twitter_scraping_utils is saved as pickle files; extract and save as a dataframe
path = '/Users/labbot/Documents/metis_bootcamp/project05/twitter_data'
all_files = glob.glob(path + "/*.pickle")

li = []

for filename in all_files:
    one_df = pd.read_pickle(filename)
    li.append(one_df)

df_raw = pd.concat(li, axis=0, ignore_index=True)
df_raw.shape

(2415468, 7)

In [7]:
# I wasn't able to exclude gifs in tweepy query, so remove them now
df = df[df['media_type'] == 'photo']

# filter to only images that contain alt text
df = df[df['alt_text'].notnull()]
df.shape

(5967, 7)

# Preprocess alt text to use for modeling
Glancing through, it (unsurprisingly) looks like the user-created alt text in my corpus will need a fair bit of cleaning to be generalizable. These are the preprocessing steps I'll take:
1. remove "junk" alt text: pre-populated text like "discord image", "something went wrong", etc.
2. remove photos of text (I want to exclude these because most screen readers have OCR built-in, so my model should focus on other types of images)
3. standardize text, removing uppercase and punctuation
4. remove the text "photo of" or "picture of", since alt text best practice is not to include that
5. remove any captions that are too long or too short after completing the above steps

## Remove records with junk alt text

In [8]:
# remove pre-populated, uninformative captions
# which of these appear frequently?
df['alt_text'].value_counts().head(7)

User report                                                                                                                                                                                                                                                                                                                                                             262
Something went wrong                                                                                                                                                                                                                                                                                                                                                    248
Discord Image                                                                                                                                                                                                                                                                   

Interesting. It looks like most of these are junk, only one would be considered "good" alt text (brief, descriptive, natural language). Are they all the same image?

In [9]:
df[df['alt_text']=='Small flowers in a planter on a sunny balcony, blossoming.']

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type
182004,1334180798027718658,2020-12-02 17:01:04,"Congrats to $ALGO, the Biggest Green Dildo of ...",https://t.co/NEx3ZGT7fs,http://pbs.twimg.com/media/EoP2XSkW4AAlI7Y.png,"Small flowers in a planter on a sunny balcony,...",photo
205463,1333833309126860800,2020-12-01 18:00:16,"The Biggest Green Dildo of the Day: $SUSHI, +1...",https://t.co/vvaWqLUxtD,http://pbs.twimg.com/media/EoK6Uy-W4AI8aRj.png,"Small flowers in a planter on a sunny balcony,...",photo
317926,1333607122202869762,2020-12-01 03:01:29,"Congrats to $QNT, the Biggest Green Dildo of t...",https://t.co/umrX9p6dUD,http://pbs.twimg.com/media/EoHsm-YXMAAp7_h.png,"Small flowers in a planter on a sunny balcony,...",photo
361878,1334331809526673408,2020-12-03 03:01:08,"Congrats to $SUSHI, the Biggest Green Dildo of...",https://t.co/HuCaO3bKao,http://pbs.twimg.com/media/EoR_tV4W8AAJyqV.png,"Small flowers in a planter on a sunny balcony,...",photo
595889,1334528062520758272,2020-12-03 16:00:59,"Congrats to $AAVE, the Biggest Green Dildo of ...",https://t.co/RULWz9Y9A4,http://pbs.twimg.com/media/EoUyMxaXcAYKV4Z.png,"Small flowers in a planter on a sunny balcony,...",photo
847141,1334377106093051904,2020-12-03 06:01:08,"Congrats to $LUNA, the Biggest Green Dildo of ...",https://t.co/8hp5hJsbiw,http://pbs.twimg.com/media/EoSo58RW4AAs45r.png,"Small flowers in a planter on a sunny balcony,...",photo
999454,1334361981638365185,2020-12-03 05:01:02,"Congrats to $LUNA, the Biggest Green Dildo of ...",https://t.co/HIygT9Skbk,http://pbs.twimg.com/media/EoSbJhSXEAEMCRB.png,"Small flowers in a planter on a sunny balcony,...",photo
1077086,1334663933802770435,2020-12-04 01:00:53,"Congrats to $SC, the Biggest Green Dildo of th...",https://t.co/t66eg4BAsG,http://pbs.twimg.com/media/EoWtxfvXMAIiG3u.png,"Small flowers in a planter on a sunny balcony,...",photo
1349512,1335026328890986496,2020-12-05 01:00:55,"Congrats to $XEM, the Biggest Green Dildo of t...",https://t.co/dkYTU3OXYJ,http://pbs.twimg.com/media/Eob3XsEXcAURKgf.png,"Small flowers in a planter on a sunny balcony,...",photo
1506586,1335011288058130436,2020-12-05 00:01:09,"Congrats to $SC, the Biggest Green Dildo of th...",https://t.co/YG4DqP8aZf,http://pbs.twimg.com/media/EobpsLZXIAc21Ne.png,"Small flowers in a planter on a sunny balcony,...",photo


oooooohkay. Definitely not a good caption; the text is not at all relevant to the image. Not sure why this image has such a mismatched alt text (these tweets look spammy), but hopefully this isn't too common in my dataset.

Based on this, I'm going to make the assumption that any caption that's appearing more than once in my dataset is likely junk, and just get rid of all of those.

In [10]:
vc = df['alt_text'].value_counts()
recurrent_alt_texts = list(vc[vc > 1].index)
df = df[~df['alt_text'].isin(recurrent_alt_texts)]
df.shape

# save this current version for topic modeling before removing the images of text
# (I want images of text to be included in topic modeling and clustering, but not in the training dataset the captioning model)
df.to_pickle('topic_modeling_dataset.pickle')

## Remove images of text
For V1, I'm going to identify/remove records based on keywords in the alt text that indicate that text is an important part of the image. 

A future version could use OCR to filter these out, although I think I do want to keep images where text is a small part of the image (eg small caption), so it would ideally be an implementation like `if text area < 15% of total then keep`.

In [9]:
text_keywords = ['reads','says','text','screenshot','\n','screen shot', 'screencap','screen cap','labeled',
                 'username','tweet', 'twitter', 'article', 'receipt',':','-','png','jpg','jpeg','metadata',
                 '@','#','.com','success by','bytes','json', 'success from']


df = df[~df['alt_text'].apply(lambda r: any([kw in r.lower() for kw in text_keywords]))]
df.sample(5)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type
745224,1334519271653842947,2020-12-03 15:26:03,"Wrote about Medusa, what a trans story is, and...",https://t.co/RU4LJ3xeh8,http://pbs.twimg.com/media/EoUqNEAXcAI_MqB.jpg,"Bronze statue of Medusa, the woman from myth w...",photo
1528783,1335009276436340741,2020-12-04 23:53:09,I love [pseudonym]! https://t.co/GzDrynmVYf,https://t.co/GzDrynmVYf,http://pbs.twimg.com/media/Eobn3BtXcAEaEQ_.jpg,An email from Spotify in effort to get me to c...,photo
600823,1334527824342908929,2020-12-03 16:00:02,homagium https://t.co/RjKCPtEaMf,https://t.co/RjKCPtEaMf,http://pbs.twimg.com/media/EoUx-q8VoAUFdSQ.png,homagium,photo
191102,1334180298117046272,2020-12-02 16:59:05,uncommon complex https://t.co/TBQ9LGRN3v,https://t.co/TBQ9LGRN3v,http://pbs.twimg.com/media/EoP16P-XMAAKDns.png,uncommon complex,photo
2203930,1333805808258322435,2020-12-01 16:11:00,Did You Know...\n\n#TriviaTuesday\n#Monotremes...,https://t.co/uIGv6VThW0,http://pbs.twimg.com/media/EnsFaZiXMAcCI9W.jpg,"Monotremes are the only mammals that lay eggs,...",photo


In [10]:
df.shape

(3005, 7)

## Lowercase and remove punctuation

In [12]:
# create field "alt text clean" with text all-lowercase and no punctuation
df['alt_text_clean'] = df['alt_text'].str.replace('[^a-zA-Z ]+', '').str.lower()
df.sample(2)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type,alt_text_clean
993423,1334362559475015683,2020-12-03 05:03:20,@HollytWolf @kayyybearxo @PaulHillierdesu A co...,https://t.co/QFBpKBqnYf,http://pbs.twimg.com/media/EoSbrNSWMAAPA2j.jpg,Nux from Mad Max: Fury Road with chrome paint ...,photo,nux from mad max fury road with chrome paint a...
2136153,1334698891481059331,2020-12-04 03:19:47,@OrcaTheGreat @mmburton My immediate thought w...,https://t.co/r2T5Hsx70r,http://pbs.twimg.com/media/EoXNaLBW8AM_4Jv.jpg,4 screenshots from 30 Rock of Chris telling Li...,photo,screenshots from rock of chris telling liz i...


In [13]:
# create "alt text cleaner" with same as above, but also remove any words that don't exist in nltk english corpus
# this ends up being too agressive of a removal strategy, getting rid of tons of common words, like "laptop"
# I also played around with using SpaCy to identify and remove proper nouns, but it wasn't do a good job of identifying them
words = nltk.corpus.words.words()
df['alt_text_cleaner'] = [' '.join(y for y in x.split() if y in words) for x in df['alt_text_clean']]

In [14]:
df.sample(3)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type,alt_text_clean,alt_text_cleaner
1369997,1335024385724280832,2020-12-05 00:53:11,@deccybb @michaeljackson my whole house smells...,https://t.co/kcZTGwjOMz,http://pbs.twimg.com/media/Eob1mnjUcAAlSyU.jpg,beef cat,photo,beef cat,beef cat
706119,1334521714173218818,2020-12-03 15:35:45,"Myself and Adam in lego form are filming, but ...",https://t.co/C8MQjsZIzv,http://pbs.twimg.com/media/EoUsbMCXIAcU6LU.jpg,Two lego figures stand against a tissue paper ...,photo,two lego figures stand against a tissue paper ...,two stand against a tissue paper background an...
980271,1334363739588595714,2020-12-03 05:08:01,🎄Christmassy critters🎄 https://t.co/fNINobenUq,https://t.co/fNINobenUq,http://pbs.twimg.com/media/EoSctSPXYAcEwG4.jpg,"Black cat, Hamilton, perched on the arm of the...",photo,black cat hamilton perched on the arm of the c...,black cat on the arm of the couch in front of ...


## Remove 'photo/picture of'

In [15]:
df['alt_text_clean'] = df['alt_text_clean'].str.replace('photo of ','')
df['alt_text_clean'] = df['alt_text_clean'].str.replace('picture of ','')

## Remove records with alt text that's too long or too short

In [16]:
# check out the current caption length
df['alt_text_clean_len'] = df['alt_text_clean'].str.split(" ").str.len()
df['alt_text_clean_len'].describe()

count    4881.000000
mean       16.922352
std        22.183556
min         1.000000
25%         5.000000
50%        10.000000
75%        19.000000
max       230.000000
Name: alt_text_clean_len, dtype: float64

In [17]:
# did a bunch of eyeballing in a spreadsheet, and captions with more than 5/fewer than 36 words seemed to be a reasonable quality cutoff
df = df[(df['alt_text_clean_len'] > 5) & (df['alt_text_clean_len'] < 36)]
df.shape

(2861, 10)

In [18]:
# save dataframe to pickle file for modeling
df.to_pickle('twitter_alt_text.pkl')

# Download all images for image caption model

In [19]:
urls = list(df['img_url'])

In [20]:
counter = 0
for i in urls:
     try:
        filename = 'twitter_images/'+i[27:46]
        response = requests.get(i)
        file = open(filename, "wb")
        file.write(response.content)
        file.close()
        if counter % 50 == 0:
            print(f'{counter} images downloaded.')
        counter +=1
     except:
         pass

# Download a random sample of images for clustering analysis

In [19]:
rand550 = df_raw.sample(550)

In [122]:
urls = list(rand550['img_url'])

counter = 0
for i in urls:
     try:
        filename = 'clustering_images/'+i[27:46]
        response = requests.get(i)
        file = open(filename, "wb")
        file.write(response.content)
        file.close()
        if counter % 50 == 0:
            print(f'{counter} images downloaded.')
        counter +=1
     except:
         pass

0 images downloaded.
50 images downloaded.
100 images downloaded.
150 images downloaded.
200 images downloaded.
250 images downloaded.
300 images downloaded.
350 images downloaded.
400 images downloaded.
450 images downloaded.
500 images downloaded.
