# Import Packages and Data

In [13]:
# basics
import pandas as pd 
import numpy as np

# files
import glob
import pickle
import os
import requests

# ocr
import pytesseract as pytesseract
from PIL import Image

# nlp
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
import demoji

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/labbot/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [14]:
# all the data scraped using twitter_scraping is saved as pickle files; extract and save as a dataframe
path = './twitter_data'
all_files = glob.glob(path + "/*.pickle")

li = []

for filename in all_files:
    one_df = pd.read_pickle(filename)
    li.append(one_df)

df_raw = pd.concat(li, axis=0, ignore_index=True)
df_raw.shape

(2415468, 7)

# Analysis of alt text prevalence

In [15]:
# wasn't able to exclude gifs in tweepy query, so remove them now
df = df_raw[df_raw['media_type'] == 'photo']
df.shape

(1985705, 7)

In [16]:
# how many photos have alt text?
alt_text_count = sum(df['alt_text'].notnull())
total_photos = len(df)
alt_text_rate = alt_text_count/total_photos

print(f'{alt_text_rate:%} of photos in the sample of {total_photos} have user-provided alt text.')
print(f"that's {alt_text_count} photos with (uncleaned) alt text.")

0.300498% of photos in the sample of 1985705 have user-provided alt text.
that's 5967 photos with (uncleaned) alt text.


In [17]:
# explore some of the alt text available
df_alt_text = df[df['alt_text'].notnull()]
df_alt_text.sample(20)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type
1518204,1335010289801113600,2020-12-04 23:57:11,First day on the slopes! 😀 ⛷ \n\n#HappyPlace #FirstDay #WinterPark https://t.co/oAWQeRqS7L,https://t.co/oAWQeRqS7L,http://pbs.twimg.com/media/EobovuoU0AASBWX.jpg,Selfie with mask,photo
819795,1334380191561502720,2020-12-03 06:13:23,Early Morning Natural Vitamin E and Zinc Dose\nhttps://t.co/PBffW244g7 https://t.co/IFaCayJU2r,https://t.co/IFaCayJU2r,http://pbs.twimg.com/media/EoSrrhqXMAIh0UO.jpg,Almonds - Early Morning Natural Vitamin E and Zinc Dose,photo
1508304,1335011148849180678,2020-12-05 00:00:35,Last chance to submit for #paid Stand-In work in Atlanta! https://t.co/vIyQK0LhxH https://t.co/ufnwBQU5Cl,https://t.co/ufnwBQU5Cl,http://pbs.twimg.com/media/EobpkLKXMAE7h9Q.jpg,"man, portrait, gloomy",photo
538106,1334531041051238405,2020-12-03 16:12:49,"""What kind of letter should we write to him?"" #WinnieThePooh #Pooh #WinnielOurson #LearnFrench https://t.co/fuT6NfE0eY",https://t.co/fuT6NfE0eY,http://pbs.twimg.com/media/EoU0tH-XMAA-4he.jpg,Quelle sorte de lettre devons-nous lui écrire?,photo
29404,1333855233802383361,2020-12-01 19:27:24,Elective surgery of the spine. https://t.co/AlEDpp2aKp https://t.co/HAFCEfLevL,https://t.co/HAFCEfLevL,http://pbs.twimg.com/media/EoLOQ_aXEAcjsWJ.jpg,Before and after x-rays of scoliosis and spinal fusion surgery.,photo
600823,1334527824342908929,2020-12-03 16:00:02,homagium https://t.co/RjKCPtEaMf,https://t.co/RjKCPtEaMf,http://pbs.twimg.com/media/EoUx-q8VoAUFdSQ.png,homagium,photo
2055169,1334705884807749634,2020-12-04 03:47:35,another one... https://t.co/QOwOZqeCZw,https://t.co/QOwOZqeCZw,http://pbs.twimg.com/media/EoXT7Y-XcAEzmqw.jpg,A photo of skinny legend Quackity,photo
1399032,1335021551884840960,2020-12-05 00:41:56,Shared from Photos app\n1 photo\nhttps://t.co/SMhiZL6hY1 https://t.co/ISkjUWLofV,https://t.co/ISkjUWLofV,http://pbs.twimg.com/media/EobyrnBXMAAlIM1.jpg,Hanging out on the Line Creek Trail.,photo
146310,1334183016554520582,2020-12-02 17:09:53,Success by edwin4233#1708 in icooknyc https://t.co/6coRicKLUj,https://t.co/6coRicKLUj,http://pbs.twimg.com/media/EoP4YaOW4AAqhhn.jpg,Discord Image,photo
2372033,1334259455899398145,2020-12-02 22:13:38,this is a cursed tradition https://t.co/oBrZ4umzUb,https://t.co/oBrZ4umzUb,http://pbs.twimg.com/media/EoQ9ZpdXYAAMKlV.jpg,"google search ""does spotify wrapped include december or do I have to live with my shame""",photo


In [18]:
pd.options.display.max_colwidth = 250

# Preprocess alt text to use for modeling
Glancing through, it (unsurprisingly) looks like the user-created alt text in my corpus will need a fair bit of cleaning to be generalizable. These are the preprocessing steps I'll take:
1. remove pre-populated stock text like "discord image", "something went wrong", etc.
2. remove photos of text (I'm not trying to create an OCR model)
3. remove proper nouns
4. remove "photo of" or "picture of", since those could confuse the model and make for bad alt text anyways
5. remove any captions that are too short the above steps

## Remove records with junk alt text

In [19]:
# remove pre-populated, uninformative captions
# which of these appear frequently?
df_alt_text['alt_text'].value_counts().head(7)

User report                                                                                                                                                                                                                                                                                                                                                             262
Something went wrong                                                                                                                                                                                                                                                                                                                                                    248
Discord Image                                                                                                                                                                                                                                                                   

Interesting. It looks like most of these are junk, only one would be considered "good" alt text (brief, descriptive, natural language). Are they all the same image?

In [20]:
df_alt_text[df_alt_text['alt_text']=='Small flowers in a planter on a sunny balcony, blossoming.']

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type
182004,1334180798027718658,2020-12-02 17:01:04,"Congrats to $ALGO, the Biggest Green Dildo of the Hour, +3.4% --&gt; https://t.co/Wt2ia9frux #ALGO #crypto https://t.co/NEx3ZGT7fs",https://t.co/NEx3ZGT7fs,http://pbs.twimg.com/media/EoP2XSkW4AAlI7Y.png,"Small flowers in a planter on a sunny balcony, blossoming.",photo
205463,1333833309126860800,2020-12-01 18:00:16,"The Biggest Green Dildo of the Day: $SUSHI, +19.8% --&gt; https://t.co/Wt2ia9frux #SUSHI #crypto https://t.co/vvaWqLUxtD",https://t.co/vvaWqLUxtD,http://pbs.twimg.com/media/EoK6Uy-W4AI8aRj.png,"Small flowers in a planter on a sunny balcony, blossoming.",photo
317926,1333607122202869762,2020-12-01 03:01:29,"Congrats to $QNT, the Biggest Green Dildo of the Hour, +6% --&gt; https://t.co/Wt2ia8XQ5X #QNT #crypto https://t.co/umrX9p6dUD",https://t.co/umrX9p6dUD,http://pbs.twimg.com/media/EoHsm-YXMAAp7_h.png,"Small flowers in a planter on a sunny balcony, blossoming.",photo
361878,1334331809526673408,2020-12-03 03:01:08,"Congrats to $SUSHI, the Biggest Green Dildo of the Hour, +3.8% --&gt; https://t.co/Wt2ia9frux #SUSHI #crypto https://t.co/HuCaO3bKao",https://t.co/HuCaO3bKao,http://pbs.twimg.com/media/EoR_tV4W8AAJyqV.png,"Small flowers in a planter on a sunny balcony, blossoming.",photo
595889,1334528062520758272,2020-12-03 16:00:59,"Congrats to $AAVE, the Biggest Green Dildo of the Hour, +7.2% --&gt; https://t.co/Wt2ia9frux #AAVE #crypto https://t.co/RULWz9Y9A4",https://t.co/RULWz9Y9A4,http://pbs.twimg.com/media/EoUyMxaXcAYKV4Z.png,"Small flowers in a planter on a sunny balcony, blossoming.",photo
847141,1334377106093051904,2020-12-03 06:01:08,"Congrats to $LUNA, the Biggest Green Dildo of the Hour, +0.9% --&gt; https://t.co/Wt2ia9frux #LUNA #crypto https://t.co/8hp5hJsbiw",https://t.co/8hp5hJsbiw,http://pbs.twimg.com/media/EoSo58RW4AAs45r.png,"Small flowers in a planter on a sunny balcony, blossoming.",photo
999454,1334361981638365185,2020-12-03 05:01:02,"Congrats to $LUNA, the Biggest Green Dildo of the Hour, +2.6% --&gt; https://t.co/Wt2ia9frux #LUNA #crypto https://t.co/HIygT9Skbk",https://t.co/HIygT9Skbk,http://pbs.twimg.com/media/EoSbJhSXEAEMCRB.png,"Small flowers in a planter on a sunny balcony, blossoming.",photo
1077086,1334663933802770435,2020-12-04 01:00:53,"Congrats to $SC, the Biggest Green Dildo of the Hour, +24.6% --&gt; https://t.co/Wt2ia9frux #SC #crypto https://t.co/t66eg4BAsG",https://t.co/t66eg4BAsG,http://pbs.twimg.com/media/EoWtxfvXMAIiG3u.png,"Small flowers in a planter on a sunny balcony, blossoming.",photo
1349512,1335026328890986496,2020-12-05 01:00:55,"Congrats to $XEM, the Biggest Green Dildo of the Hour, +6.4% --&gt; https://t.co/Wt2ia9frux #XEM #crypto https://t.co/dkYTU3OXYJ",https://t.co/dkYTU3OXYJ,http://pbs.twimg.com/media/Eob3XsEXcAURKgf.png,"Small flowers in a planter on a sunny balcony, blossoming.",photo
1506586,1335011288058130436,2020-12-05 00:01:09,"Congrats to $SC, the Biggest Green Dildo of the Hour, +3.8% --&gt; https://t.co/Wt2ia9frux #SC #crypto https://t.co/YG4DqP8aZf",https://t.co/YG4DqP8aZf,http://pbs.twimg.com/media/EobpsLZXIAc21Ne.png,"Small flowers in a planter on a sunny balcony, blossoming.",photo


oooooohkay. Definitely not a good caption. Not sure why this image has such a mismatched alt text (these tweets look spammy), but hopefully this isn't too common in my dataset.

Based on this, I'm going to make the assumption that any caption that's appearing more than once in my dataset is likely junk, and just get rid of all of those.

In [21]:
vc = df_alt_text['alt_text'].value_counts()
recurrent_alt_texts = list(vc[vc > 1].index)
df_alt_text = df_alt_text[~df_alt_text['alt_text'].isin(recurrent_alt_texts)]
df_alt_text.shape
df_tm = df_alt_text

## Remove images of text
For V1, I'm going to identify/remove records based on keywords in the alt text that indicate that text is an important part of the image. 

A future version could use OCR, although I think I do want to keep images where text is a small part of the image (eg small caption), so it would ideally be an implementation like `if text area < 15% of total then keep`.

In [22]:
text_keywords = ['reads','says','text','screenshot','\n','screen shot', 'screencap','screen cap','labeled',
                 'username','tweet', 'twitter', 'article', 'receipt',':','-','png','jpg','jpeg','metadata',
                 '@','#','.com','success by','bytes','json', 'success from']


df_alt_text = df_alt_text[~df_alt_text['alt_text'].apply(lambda r: any([kw in r.lower() for kw in text_keywords]))]
df_alt_text.sample(10)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type
812528,1334381015641583617,2020-12-03 06:16:40,@ADHDeanASL @prufrockluvsong Basically how I feel https://t.co/e0A4z1G1pk,https://t.co/e0A4z1G1pk,http://pbs.twimg.com/media/EoSsdgeW8AAFiOs.jpg,Car with its rear bumper held on with duct tape,photo
925436,1334368832119578630,2020-12-03 05:28:15,@santaklaudss Hey how are you what you doing up https://t.co/jCaxVxlGWE,https://t.co/jCaxVxlGWE,http://pbs.twimg.com/media/EoShQLaWMAAbbSQ.jpg,There dog for you,photo
2105016,1334701519044796426,2020-12-04 03:30:14,New keyboard for my new desk. \n by /u/Kreydor \n \n https://t.co/wUtTZuy0ET https://t.co/o5fPp55lFs,https://t.co/o5fPp55lFs,http://pbs.twimg.com/media/EoXP9HAW4AEYV01.jpg,New keyboard for my new desk.,photo
2136065,1334698897063763968,2020-12-04 03:19:49,"love a fun, relaxing, distracting night playing board games https://t.co/8gRoOWaMor",https://t.co/8gRoOWaMor,http://pbs.twimg.com/media/EoXNkhMW4AQOArd.jpg,pic of board game called pandemic,photo
397813,1334328615224283137,2020-12-03 02:48:27,for only 160 usd you can get the same razor mouse you already have but yellow and presented over a pond of piss https://t.co/N1BIk5FXXD,https://t.co/N1BIk5FXXD,http://pbs.twimg.com/media/EoR8za6UYAUMTBq.jpg,A primarily yellow gaming mouse sitting on a stand that's pictured over a radiating pool of yellow that's superimposed on a city skyline at night.,photo
544848,1334530626716889094,2020-12-03 16:11:10,Month nine of quarantimes: started reading @TheMontyDon to my houseplants https://t.co/uK2d6VkIZC,https://t.co/uK2d6VkIZC,http://pbs.twimg.com/media/EoU0h-3W4Acr6gN.jpg,"Molly reading Monty Don's book ""Down to Earth"" to a giant monstera plant and a shelf full of pothos and Christmas cacti",photo
1762045,1333576577502896131,2020-12-01 01:00:07,Pizza cat https://t.co/6Cw2wkLL6U,https://t.co/6Cw2wkLL6U,http://pbs.twimg.com/media/EoHQ0f5XMAEVzqI.jpg,Tabby kitten (big) on hind legs with front paws pressed against front of oven. Empty pizza box on counter next to stove.,photo
948926,1334366595813158913,2020-12-03 05:19:22,a new Mol bust bc why not\n#fursona #furry #furryart https://t.co/MHkfZjdoVF,https://t.co/MHkfZjdoVF,http://pbs.twimg.com/media/EoSe2-ZXIAAMKbH.jpg,"An anthro jerboa who has pink, blue and white fur, blue eyes, and a pink nose. They're tilting their head to the side and smiling. They have a purple jumper, and are on a red background.",photo
2372033,1334259455899398145,2020-12-02 22:13:38,this is a cursed tradition https://t.co/oBrZ4umzUb,https://t.co/oBrZ4umzUb,http://pbs.twimg.com/media/EoQ9ZpdXYAAMKlV.jpg,"google search ""does spotify wrapped include december or do I have to live with my shame""",photo
98392,1334186107748061184,2020-12-02 17:22:10,"Good morning, moon. \n\n85mm stopped down to f/5.6 @ 1/4000s. Shot with in-camera b&amp;w, edited &amp; cropped in Lightroom. https://t.co/kcjL2wtM9j",https://t.co/kcjL2wtM9j,http://pbs.twimg.com/media/EoP7AY8VcAAqKZW.jpg,"A black & white photo of the moon (waning gibbous) through tree branches on the lower right corner, balanced in the upper left corner by a flock of pigeons.",photo


In [23]:
df_alt_text.shape

(3005, 7)

In [24]:
df['created_at'].max()

Timestamp('2020-12-05 01:53:43')

In [25]:
df_alt_text.sample(10)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type
1553838,1335006904779730944,2020-12-04 23:43:43,Look at this spread https://t.co/Bgg7bPLT7b,https://t.co/Bgg7bPLT7b,http://pbs.twimg.com/media/Eobln6cXcAEXZn4.jpg,Five plates of various sushi pieces cut to show fish and cream cheese,photo
927143,1334368657204502530,2020-12-03 05:27:33,Happy lid is happy. https://t.co/MJEuNuco8w,https://t.co/MJEuNuco8w,http://pbs.twimg.com/media/EoShN05XUAccLgE.jpg,"Bild zeigt Deckel einer Erdnussdose, es sind Rillen eingestanzt, die wie ein Gesicht aussehen.",photo
2208584,1333805472114208770,2020-12-01 16:09:40,I made Lucy a crude tent a few days ago and so far she has only left it for food-in or food-out. https://t.co/yNr5EwRzrO,https://t.co/yNr5EwRzrO,http://pbs.twimg.com/media/EoKhADpWMAM72n5.jpg,"A calico cat on a couch, under a fort made from a blue blanket.",photo
2372033,1334259455899398145,2020-12-02 22:13:38,this is a cursed tradition https://t.co/oBrZ4umzUb,https://t.co/oBrZ4umzUb,http://pbs.twimg.com/media/EoQ9ZpdXYAAMKlV.jpg,"google search ""does spotify wrapped include december or do I have to live with my shame""",photo
1623187,1335000550241411075,2020-12-04 23:18:28,"I got spicy skewers and the bag says, ""In every person's stomach lives a little devil named *gluttony*"" https://t.co/QuTjfO9yuy",https://t.co/QuTjfO9yuy,http://pbs.twimg.com/media/EobfTIUW4Ac3C5n.jpg,"a bright yellow bag with skewers poking put the top, and Chinese writing on the bag",photo
249011,1333830042682609664,2020-12-01 17:47:18,I fended off kitty village-pillage by installing a guard dog. https://t.co/TugGwbPz1E https://t.co/1UbxIfba8D,https://t.co/1UbxIfba8D,http://pbs.twimg.com/media/EoK3VO0XYAA-vRI.jpg,Advent calendar village with dog on sofa.,photo
2131455,1334699273917591554,2020-12-04 03:21:19,If you in Vegas hit me!! https://t.co/vFSBB2wVMK,https://t.co/vFSBB2wVMK,http://pbs.twimg.com/media/EoXN5fLU8AABv_i.jpg,Tap in I’m here ... Vegas,photo
1181397,1334655300008202241,2020-12-04 00:26:34,"Let it snow, let it snow, let it fucking snow. 🌨 #Anon #m3 https://t.co/3XirzESxQ4",https://t.co/3XirzESxQ4,http://pbs.twimg.com/media/EoWl62IUUAEq3uI.jpg,What you need if your visually impaired,photo
2393356,1333222203857530883,2020-11-30 01:31:58,Return from Witch Mountain (1978) https://t.co/58WJyhpmrX,https://t.co/58WJyhpmrX,http://pbs.twimg.com/media/EoCOh01XcAAhE-V.png,Return from Witch Mountain (1978),photo
683403,1334523018748567552,2020-12-03 15:40:56,"Combined day, me, vs robot! Robot won, but - Nice day! https://t.co/2z4QMfym5l",https://t.co/2z4QMfym5l,http://pbs.twimg.com/media/EoUtOHxXcAcx1Qz.png,Me,photo


## Lowercase and remove punctuation

In [26]:
import re
import string

In [27]:
# # preprocess text

# text = " ".join(caption for caption in df_alt_text['alt_text']).lower()
# # remove punctuation
# text = re.sub(r'[^\w]', ' ',text)
# # remove numbers
# text = re.sub('\w*\d\w*', ' ',text)

In [28]:
# text = " ".join(caption for caption in df_alt_text['alt_text']).lower()
# text = "".join(filter(str.isalpha(),text))
# text[:30]

In [29]:
# text = " ".join(caption for caption in df_alt_text['alt_text']).lower()
# text = re.sub(r'[^a-zA-Z]',' ',text)
# text[:500]

In [30]:
df_alt_text['alt_text_clean'] = df_alt_text['alt_text'].str.replace('[^a-zA-Z ]+', '').str.lower()
df_alt_text.sample(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_alt_text['alt_text_clean'] = df_alt_text['alt_text'].str.replace('[^a-zA-Z ]+', '').str.lower()


Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type,alt_text_clean
143557,1334183190207008771,2020-12-02 17:10:35,@gankstrr Right there with you https://t.co/DkChQmH0OX,https://t.co/DkChQmH0OX,http://pbs.twimg.com/media/EoP4ihSXUAAygpJ.jpg,"“You listened to 154 genres this year, including 61 new ones.”",photo,you listened to genres this year including new ones
1876569,1334722530473615360,2020-12-04 04:53:43,Fiesta https://t.co/YEVpaHvtE1,https://t.co/YEVpaHvtE1,http://pbs.twimg.com/media/EoXjEI7XUAAdjoO.jpg,Fiesta....,photo,fiesta


In [31]:
words = nltk.corpus.words.words()
df_alt_text['alt_text_cleaner'] = [' '.join(y for y in x.split() if y in words) for x in df_alt_text['alt_text_clean']]
#df_alt_text['alt_text_cleaner'] = [word for word in df_alt_text['alt_text_clean'].str.split() if word in words]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_alt_text['alt_text_cleaner'] = [' '.join(y for y in x.split() if y in words) for x in df_alt_text['alt_text_clean']]


In [32]:
df_alt_text.head()

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type,alt_text_clean,alt_text_cleaner
747,1333857393801891842,2020-12-01 19:35:59,"@Hallmark I just wanted to buy stuff, not be assaulted (´༎ຶོρ༎ຶོ`) https://t.co/H617bRVeCV",https://t.co/H617bRVeCV,http://pbs.twimg.com/media/EoLQOiKXYAY52gk.jpg,I’ve never met a more hostile pinecone in my life,photo,ive never met a more hostile pinecone in my life,never met a more hostile in my life
1003,1333857368677859329,2020-12-01 19:35:53,"Happy for Elliot Page, and also relieved https://t.co/RUF0l6aYTT",https://t.co/RUF0l6aYTT,http://pbs.twimg.com/media/EoLNBYeW8AANYfE.png,"Under ""Trends for Portland, United States,"" ""Elliot"" is listed as No. 1",photo,under trends for portland united states elliot is listed as no,under for united is listed as no
2032,1333857294115889154,2020-12-01 19:35:35,I have been doing some color-coding lately. https://t.co/J0ZU1oCgMI,https://t.co/J0ZU1oCgMI,http://pbs.twimg.com/media/EoLQFU5XIAUta9w.jpg,An Atreus keyboard with multi color keys.,photo,an atreus keyboard with multi color keys,an keyboard with color
2117,1333857290345123844,2020-12-01 19:35:34,'Hats off' to 'mask on'! \n\n📸: lailucien (IG) https://t.co/dw9GxtNaxu,https://t.co/dw9GxtNaxu,http://pbs.twimg.com/media/EoLQCVZXcAQbhkO.jpg,Woman with mask in subway station,photo,woman with mask in subway station,woman with mask in subway station
3013,1333857218282868737,2020-12-01 19:35:17,This is what I have to put up with. https://t.co/rWBp9dtj4e,https://t.co/rWBp9dtj4e,http://pbs.twimg.com/media/EoLP6pzWEAIAVDF.jpg,The cat has plunged himself into the Christmas tree. He is definitely eating it.,photo,the cat has plunged himself into the christmas tree he is definitely eating it,the cat himself into the tree he is definitely eating it


In [33]:
df_alt_text['alt_text_clean_len'] = df_alt_text['alt_text_clean'].str.split(" ").str.len()
df_alt_text['alt_text_clean_len'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_alt_text['alt_text_clean_len'] = df_alt_text['alt_text_clean'].str.split(" ").str.len()


count    3005.000000
mean       11.497504
std        13.734878
min         1.000000
25%         4.000000
50%         7.000000
75%        14.000000
max       180.000000
Name: alt_text_clean_len, dtype: float64

In [34]:
df_alt_text.to_clipboard()

In [35]:
# did a bunch of eyeballing in a spreadsheet, and captions with more than 6/fewer than 36 words seemed to be a reasonable quality cutoff
df_alt_text = df_alt_text[(df_alt_text['alt_text_clean_len'] > 5) & (df_alt_text['alt_text_clean_len'] < 36)]
df_alt_text.shape

(1720, 10)

# Remove 'photo/picture of'

In [36]:
df_alt_text['alt_text_clean'] = df_alt_text['alt_text_clean'].str.replace('photo of ','')
df_alt_text['alt_text_clean'] = df_alt_text['alt_text_clean'].str.replace('picture of ','')

In [37]:
df_alt_text.sample(5)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type,alt_text_clean,alt_text_cleaner,alt_text_clean_len
1573779,1335005042722676739,2020-12-04 23:36:20,This James Baldwin quote is gold https://t.co/v3HGj2KygH,https://t.co/v3HGj2KygH,http://pbs.twimg.com/media/EobkAubXIAENO_i.jpg,Yes when I was young I knew a lot! Now I don't know nothing which is a great relief,photo,yes when i was young i knew a lot now i dont know nothing which is a great relief,yes when i was young i knew a lot now i dont know nothing which is a great relief,19
2408476,1331397529691451393,2020-11-25 00:41:21,lorde enjoying not giving us what we really want https://t.co/FKlYhznVfo,https://t.co/FKlYhznVfo,http://pbs.twimg.com/media/EnoS_hXWEAAnB-v.png,Well. I’m here in your inbox because I have some exciting news. Not that news. Don’t get too excited. Just a moderate amount.,photo,well im here in your inbox because i have some exciting news not that news dont get too excited just a moderate amount,well here in your because i have some exciting news not that news dont get too excited just a moderate amount,23
1827581,1334726662425112578,2020-12-04 05:10:09,“Switchin' them positions for you” 🎶🎵\n\n#positions #ag6 #leatherface #ArianaGrande #horrorfam https://t.co/V36wjXHkWU,https://t.co/V36wjXHkWU,http://pbs.twimg.com/media/EoXm0QRW8AEwVAD.jpg,A photo of Leatherface from the Texas Chain Saw Massacre holding a mallet and wearing an apron over a button up and tie.,photo,a leatherface from the texas chain saw massacre holding a mallet and wearing an apron over a button up and tie,a photo of from the chain saw massacre holding a mallet and wearing an apron over a button up and tie,23
1680637,1333559761384529920,2020-11-30 23:53:18,The guinea pig of plumpness is blessing timelines because let’s fuckin’ go https://t.co/zww70jQORa,https://t.co/zww70jQORa,http://pbs.twimg.com/media/EoHBiMrUwAEYXt4.jpg,"A rotund guinea pig sitting on a plump butt, chocolate brown furry glee, big feet, is very cute",photo,a rotund guinea pig sitting on a plump butt chocolate brown furry glee big feet is very cute,a rotund guinea pig sitting on a plump butt chocolate brown furry glee big is very cute,18
1130693,1334659446316937217,2020-12-04 00:43:03,They are his. https://t.co/u8wg7cYkQ3,https://t.co/u8wg7cYkQ3,http://pbs.twimg.com/media/EoWpsQrXIAAj7dO.jpg,Ogie the orange tabby holding two balls under his arm,photo,ogie the orange tabby holding two balls under his arm,the orange tabby holding two under his arm,10


In [38]:
df_alt_text.to_pickle('twitter_alt_text.pkl')

## Remove emojis

In [39]:
#demoji.download_codes()

In [40]:
#text = demoji.replace(text)

## Remove proper nouns

In [41]:
# identify all proper nouns
tagged_text = pos_tag(text.split())
propernouns = [word for word,pos in tagged_text if pos == 'NNP']
len(propernouns)

NameError: name 'text' is not defined

In [42]:
# check what the propernouns look like... nltk doesn't seem to be doing a great job identifying them
propernouns[:30]

NameError: name 'propernouns' is not defined

second way of trying this... keep only english words

In [None]:
words = set(nltk.corpus.words.words())
text = " ".join(word for word in nltk.wordpunct_tokenize(text))

In [None]:
text[:1000]

In [None]:
tagged_text = pos_tag(text.split())
propernouns = [word for word,pos in tagged_text if pos == 'NNP']

In [None]:
len(propernouns)

# Download Images

cutoff (max timestamp) for first image download: `Timestamp('2020-12-02 22:57:08')`

In [None]:
urls = list(df_alt_text['img_url'])

In [None]:
# counter = 0
# for i in urls:
# #     try:
#         filename = 'twitter_images/'+i[27:46]
#         response = requests.get(i)
#         file = open(filename, "wb")
#         file.write(response.content)
#         file.close()
#         if counter % 20 == 0:
#             print(f'{counter} images downloaded.')
#         counter +=1
# #     except:
# #         pass

# Topic Modeling

## Preprocess data

In [62]:
from SpacyPreprocessor import SpacyPreprocessor

In [63]:
df_tm.sample(30)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type
1704206,1333883862498369542,2020-12-01 21:21:09,@CallMeAgent00 @SiimplyGrinding @ImDukeDennis @imdavisss Never boxing again https://t.co/a1a5RxiSj5,https://t.co/a1a5RxiSj5,http://pbs.twimg.com/media/EoLoTcqXIAIp2Vx.jpg,Never boxing again,photo
1107876,1334661322215067649,2020-12-04 00:50:30,Test shoots of Lia T from Upfront Models. Sitting down portraits.\n\n#testshoot #studiophotography #beautyphotoshoot https://t.co/rwiUUGKlFK,https://t.co/rwiUUGKlFK,http://pbs.twimg.com/media/EoWrY0pU0AU6dAE.jpg,Asian beauty lia tan sleevesless top smiling portrait,photo
1721958,1333882419208998912,2020-12-01 21:15:25,Made a start on Christmas today... https://t.co/8iR1HEF1Qq,https://t.co/8iR1HEF1Qq,http://pbs.twimg.com/media/EoLm_JRWMAEtNG8.jpg,Our Christmas tree for 2020.,photo
2240745,1334269767243350017,2020-12-02 22:54:36,I have a new favourite term. Although I feel I might be calling it Homeric Expanded Universe rather than just Homer. https://t.co/j2yD50Os1g,https://t.co/j2yD50Os1g,http://pbs.twimg.com/media/EoRHEYpUUAA80nM.png,Tumblr post by britneyshakespearean: getting shot in the ankle is probably what achilles is best known for but thats not even in the iliad its part of the heu (homer expanded universe),photo
1772382,1333575721638391812,2020-12-01 00:56:43,me: I should perform a task\nexecutive dysfunction: https://t.co/vJwjklfl6I,https://t.co/vJwjklfl6I,http://pbs.twimg.com/media/EoHQDPxXMAEgYra.jpg,"A screenshot of a message in a Gmail inbox. Subject line reads “Whoa....you DON’T want to...” and is cut off after that. Sender is B&N UD Bookstore, time is 7:06 PM.",photo
1178254,1334655553805684742,2020-12-04 00:27:35,@Hood_Biologist Dr. Rodrigues asks a good question though: https://t.co/sztdSe5asJ,https://t.co/sztdSe5asJ,http://pbs.twimg.com/media/EoWlumoU0AAT2n4.png,"Tweet from Dr. Rodrigues that says ""Primatologists spend so much of our effort on social media, in person, in so many forms of public engagement trying to get the message across that #PrimatesAreNotPets and these images cause harm. Why is [it] ha...",photo
1244061,1335036566197112837,2020-12-05 01:41:35,What are the scent notes for Yankee Candle Xmas Orgy do you think?\n\nSource: @radillustrates https://t.co/OVogl7lMvl https://t.co/Ss292J7X1A,https://t.co/Ss292J7X1A,http://pbs.twimg.com/media/EocAaHNW8AAL71r.png,"cartoon drawing of a yankee candle jar with ""Yankee Candle Xmas Orgy"" on the label with cartoon people having an orgy",photo
973890,1334364317563555841,2020-12-03 05:10:19,"@dumplinghoee Meanwhile, Rahul Behind the stumps\n#INDvsAUS \n#justiceforKXIP https://t.co/zxotxPSv3F",https://t.co/zxotxPSv3F,http://pbs.twimg.com/media/EoSdRQ2U4AAwaVT.jpg,Dhokebaaz Maxi,photo
1892093,1334721026240950274,2020-12-04 04:47:45,The Dublin Journal of Medical Science (Published: 1903)\nFull text: https://t.co/INWBYWTQHI https://t.co/E5w2W8zBHM,https://t.co/E5w2W8zBHM,http://pbs.twimg.com/media/EoXhsnrW4AEHxsA.jpg,Page image from The Dublin Journal of Medical Science,photo
583180,1334528726504239104,2020-12-03 16:03:37,Success by Frosty❄#1785 in @Join_Wolfpack https://t.co/hveqCJGDlZ,https://t.co/hveqCJGDlZ,http://pbs.twimg.com/media/EoUyzbvXUAAImdT.jpg,Success by Frosty❄#1785 in @Join_Wolfpack,photo


### lemmatize and remove numbers, symbols, POS with SpaCy

In [64]:
spacy_model = SpacyPreprocessor.load_model()

In [65]:
preprocessor = SpacyPreprocessor(spacy_model=spacy_model, lemmatize=True, remove_numbers=True, 
                                 remove_stopwords=False, remove_special=True, 
                                 pos_to_remove=['ADP','SYM','NUM','AUX'])
df_tm['spacy_pipe'] = preprocessor.preprocess_text_list(list(df_tm['alt_text']))

4881it [00:04, 1006.27it/s]


In [66]:
df_tm.sample(7)
# could try joining tweet text PLUS alt text as documents

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type,spacy_pipe
529929,1334531548872314880,2020-12-03 16:14:50,"Really enjoying the Sweetwater sound mag, SweetNotes 👀👀 @SweetwaterSound thanks for sending it! https://t.co/EP5Vy3QSwC",https://t.co/EP5Vy3QSwC,http://pbs.twimg.com/media/EoU1XnHW8AEnzw-.jpg,Sweetwater gear mag On table with Christmas flower,photo,sweetwater gear mag table christmas flower
2269308,1334267424460349443,2020-12-02 22:45:18,@200okpublic ^_^ - Perfect logo? Happened have one of these lying around... https://t.co/I7GkQZjVqQ,https://t.co/I7GkQZjVqQ,http://pbs.twimg.com/media/EoRFJmlUYAAl31S.jpg,Alt for the previous image's tweet: A Lego mini fig with a blue beanie I happened to to coincidently have which describe @200okpublic's previous tweet on UI community logo.,photo,alt the previous image 's tweet a lego mini fig a blue beanie pron happen to coincidently which describe okpublic 's previous tweet ui community logo
282259,1332483102665547789,2020-11-28 00:35:02,Sale extended through Monday 🖤\nhttps://t.co/EeD9XMQUYs\n #cdcoshops https://t.co/1XsA5gm7iw,https://t.co/1XsA5gm7iw,http://pbs.twimg.com/media/En3uUZdXMA08y7u.jpg,Black Business Friday Virtual Experience logo,photo,black business friday virtual experience logo
835077,1334378440754143232,2020-12-03 06:06:26,Can we all take a moment to appreciate the true horror of what motherhood has done for my Spotify Wrapped.... https://t.co/1a8hUntnLs,https://t.co/1a8hUntnLs,http://pbs.twimg.com/media/EoSqHbSWMAEoik7.jpg,Your top song of the year is once soon a dream from the Disney Princesses album,photo,pron top song the year once soon a dream the disney princesses album
1688824,1333884943638945794,2020-12-01 21:25:27,Happy new month https://t.co/f31KnXpl7H,https://t.co/f31KnXpl7H,http://pbs.twimg.com/media/EoLpQomXUAA99fy.jpg,Happy new month,photo,happy new month
2322619,1334263169691226113,2020-12-02 22:28:23,I DIDN'T EVEN FINISH MY ROLLS https://t.co/FS6M5XyWfH,https://t.co/FS6M5XyWfH,http://pbs.twimg.com/media/EoRBMKNU0AIcTk0.jpg,CHONGYUN,photo,chongyun
645959,1334525313418661889,2020-12-03 15:50:03,9 Best #Online #Marketing Channels for You to Try in 2021. https://t.co/ALzmO3gPMk https://t.co/jAALSX0mmI,https://t.co/jAALSX0mmI,http://pbs.twimg.com/media/EoUvs0WWMAU6zgi.jpg,"entrepreneur, idea, competence",photo,entrepreneur idea competence


### Check for and remove super short documents

In [67]:
df_tm['spacy_pipe_len'] = df_tm['spacy_pipe'].str.split(" ").str.len()
df_tm['spacy_pipe_len'].describe()

count    4881.000000
mean       17.015775
std        22.858222
min         1.000000
25%         5.000000
50%        10.000000
75%        20.000000
max       291.000000
Name: spacy_pipe_len, dtype: float64

In [68]:
df_tm = df_tm[df_tm['spacy_pipe_len'] >= 5]

## Try different models

In [69]:
%load_ext autoreload
%autoreload 2

In [70]:
from TopicModeling import topic_model
import itertools
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [71]:
# define corpus as the spacy-processed version of the data
corpus = df_tm['spacy_pipe']

In [72]:
# define stopwords
stop_words = spacy_model.Defaults.stop_words

# add custom stopwords to stopwords list
custom_stopwords = ["pron"]

for s in custom_stopwords:
    stop_words.add(s)

In [73]:
params = {
    'stop_words':stop_words,
    'min_df':10,
    'ngram_range':(1, 2)
}
vectorizers = [CountVectorizer(**params),TfidfVectorizer(**params)]
models = ['lsa', 'nmf', 'lda']

combinations = list(itertools.product(vectorizers, models))

In [74]:
# for c in combinations:
#     #print(c)
#     topic_model_c = topic_model(data=corpus,
#                               vectorizer=c[0],
#                               model=c[1],
#                               num_topics=10)
#     topics = topic_model_c.get_topics()
        

In [75]:
# at_topics = topic_model(data=corpus,
#                              vectorizer=CountVectorizer(**params),
#                             model='lsa',num_topics=10)
# at_topics_cv_lsa = at_topics.get_topics()

In [103]:
# fit vectorizer
vectorizer = CountVectorizer(**params)
doc_word_matrix = vectorizer.fit_transform(corpus)
doc_word_matrix.shape



(3716, 743)

In [104]:
# create and fit decomposition model
nmf = NMF(n_components=12,random_state=7)

# create the document-topic matrix
doc_topic_matrix = nmf.fit_transform(doc_word_matrix)

# create columns names
topicnames = ['Topic_' + str(i) for i in range(nmf.n_components)]

# index names
docnames = ['AltText_' + str(i) for i in range(len(corpus))]

# create a dataframe
df_doc_topic = pd.DataFrame(np.round(doc_topic_matrix,4), columns=topicnames, index=docnames)

df_doc_topic.head()

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11
AltText_0,0.0077,0.005,0.0098,0.0413,0.0048,0.0056,0.0066,0.0,0.0267,0.0094,0.0018,0.0091
AltText_1,0.0,0.2578,0.0,0.0155,0.0054,0.0,0.0,0.0,0.0143,0.0044,0.0,0.0143
AltText_2,0.0002,0.001,0.0019,0.005,0.0007,0.0,0.0,0.0,0.0003,0.0005,0.0,0.0022
AltText_3,0.0,0.0018,0.0026,0.0,0.0,0.032,0.0,0.0,0.0,0.0033,0.0,0.0009
AltText_4,0.0027,0.0044,0.0005,0.0006,0.0009,0.0,0.0,0.0053,0.0072,0.0071,0.0125,0.0173


In [105]:
def show_topics(vectorizer, model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [106]:
topic_keywords = show_topics(vectorizer, nmf, 15)
topic_keywords

[array(['blue', 'red', 'wear', 'background', 'green', 'pink', 'dark',
        'hair', 'light', 'yellow', 'purple', 'brown', 'eye', 'tree',
        'grey'], dtype='<U18'),
 array(['text', 'image', 'read', 'image text', 'automatic',
        'automatic image', 'text read', 'background', 'right', 'view',
        'pay', 'page', 'book', 'set', 'item'], dtype='<U18'),
 array(['com', 'https', 'www', 'https www', 'instagram', 'twitter',
        'follow', 'video', 'group', 'study', 'twitter com', 'office',
        'join', 'comment', 'shot'], dtype='<U18'),
 array(['like', 'love', 'know', 'time', 'thing', 'number', 'people',
        'good', 'feel', 'care', 'word', 'day', 'want', 'look like',
        'small'], dtype='<U18'),
 array(['black', 'wear', 'wear black', 'hair', 'pink', 'shirt',
        'black white', 'silver', 'tie', 'man', 'people', 'jacket',
        'purple', 'dress', 'french'], dtype='<U18'),
 array(['screenshot', 'tweet', 'read', 'spotify', 'song', 'picture',
        'wrap', 'spotify

In [107]:
# add original comments text back into doc_topic matrix
df_doc_topic['orig_comments'] = df_tm['alt_text'].values
df_doc_topic.sample(2)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11,orig_comments
AltText_2017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mallard Duck at Potter Marsh and Spruce Grouse in the Chugach Range.
AltText_3222,0.0,0.0079,0.0,0.0092,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0003,“Your top artist was The National. You were in the top 1% of their listeners this year.”


In [108]:
def top_docs(df_doc_topic, topic, n_docs):
    return (df_doc_topic
            .sort_values(by=topic, ascending=False)
            .head(n_docs)['orig_comments']
            .values)

In [109]:
topic_docs = top_docs(df_doc_topic, 'Topic_11',10)
topic_docs

array(['using System;\nusing System.Collections.Generic;\nusing System.ComponentModel;\nusing System.Data;\nusing System.Drawing;\nusing System.Linq;\nusing System.Text;\nusing System.Threading.Tasks;\nusing System.Windows.Forms;\n\nnamespace WindowsFormsApplication6\n{\n    public partial class Form1 : Form\n    {\n        private MyFont _font;\n        public Form1()\n        {\n            InitializeComponent();\n        }\n\n        priv...',
       'Your next steps depend on the total of the Used column from the df -h command above.      If you’re using less space than your intended plan requires, you can move onto the next step without any further action.     If you’re using more space than your intended plan allows, you need to remove some files to free up some space before moving onto the next step. See the options for doing this in the Download Files from Your Linode guide.  Before resizing your Linode to a new plan, you need to resize the disk to match the storage volume of t

# Export data for image clustering

In [116]:
df

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type
0,1333857441956519936,2020-12-01 19:36:10,Our lil tiny Christmas tree I kinda wanna pain...,https://t.co/NxNnBooOv7,http://pbs.twimg.com/media/EoLQRYPWEAAIoum.jpg,,photo
1,1333857440949932032,2020-12-01 19:36:10,"Giroud, tammy and cavani 😂🤣 https://t.co/BMjJu...",https://t.co/geWysH2kml,http://pbs.twimg.com/media/EoLQRdTW4AAa4K-.jpg,,photo
2,1333857445312139265,2020-12-01 19:36:11,Some facts about the Quran❤️ https://t.co/6NZT...,https://t.co/6NZTWBjQwH,http://pbs.twimg.com/media/EoLQRMlXYAkv58_.jpg,,photo
4,1333857442283675650,2020-12-01 19:36:10,"Sure, you've met me, but have you met 18 y/o M...",https://t.co/nrd1zhT9u8,http://pbs.twimg.com/media/EoLQQtkW4AEjNGu.png,,photo
5,1333857441734389760,2020-12-01 19:36:10,@DeanBrowningPA @AOC This you? https://t.co/n8...,https://t.co/n8rnhl22dr,http://pbs.twimg.com/media/EoLQRg3W4AA2207.jpg,,photo
...,...,...,...,...,...,...,...
2415463,1331004682823733249,2020-11-23 22:40:19,Now For Sale:\nBatman Beyond #49 Manapul Varia...,https://t.co/I6jJIVSPKN,http://pbs.twimg.com/media/EnittEwXUAcQdiA.jpg,,photo
2415464,1331004677983334401,2020-11-23 22:40:18,no to body shame💓 https://t.co/z6bTc8nztW,https://t.co/z6bTc8nztW,http://pbs.twimg.com/media/Enitf2MVoAILmrN.jpg,,photo
2415465,1331004680684539907,2020-11-23 22:40:19,It's nice and cool!! \nDrum camp in Greece. ht...,https://t.co/azdOUwCJZM,http://pbs.twimg.com/media/Enits-_UYAAUn2G.jpg,,photo
2415466,1331004682555285504,2020-11-23 22:40:19,I should tweet it with this but ight https://t...,https://t.co/00e1ShH0Hx,http://pbs.twimg.com/media/EnitrQgWMAA1AjW.jpg,,photo


In [121]:
rand550 = df.sample(550)

In [122]:
urls = list(rand550['img_url'])

counter = 0
for i in urls:
#     try:
        filename = 'clustering_images/'+i[27:46]
        response = requests.get(i)
        file = open(filename, "wb")
        file.write(response.content)
        file.close()
        if counter % 50 == 0:
            print(f'{counter} images downloaded.')
        counter +=1
#     except:
#         pass

0 images downloaded.
50 images downloaded.
100 images downloaded.
150 images downloaded.
200 images downloaded.
250 images downloaded.
300 images downloaded.
350 images downloaded.
400 images downloaded.
450 images downloaded.
500 images downloaded.
