# Import Packages and Data

In [38]:
# basics
import pandas as pd 
import numpy as np

# files
import glob
import pickle
import os
import requests

# ocr
import pytesseract as pytesseract
from PIL import Image

# nlp
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
import demoji

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/labbot/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [39]:
# all the data scraped using twitter_scraping is saved as pickle files; extract and save as a dataframe
path = './twitter_data'
all_files = glob.glob(path + "/*.pickle")

li = []

for filename in all_files:
    one_df = pd.read_pickle(filename)
    li.append(one_df)

df_raw = pd.concat(li, axis=0, ignore_index=True)
df_raw.shape

(2415468, 7)

# Analysis of alt text prevalence

In [40]:
# wasn't able to exclude gifs in tweepy query, so remove them now
df = df_raw[df_raw['media_type'] == 'photo']
df.shape

(1985705, 7)

In [41]:
# how many photos have alt text?
alt_text_count = sum(df['alt_text'].notnull())
total_photos = len(df)
alt_text_rate = alt_text_count/total_photos

print(f'{alt_text_rate:%} of photos in the sample of {total_photos} have user-provided alt text.')
print(f"that's {alt_text_count} photos with (uncleaned) alt text.")

0.300498% of photos in the sample of 1985705 have user-provided alt text.
that's 5967 photos with (uncleaned) alt text.


In [42]:
# explore some of the alt text available
df_alt_text = df[df['alt_text'].notnull()]
df_alt_text.sample(7)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type
1416524,1335019810334396417,2020-12-05 00:35:00,Thanks...I hate it https://t.co/FA6r8IlaLJ,https://t.co/FA6r8IlaLJ,http://pbs.twimg.com/media/EobxcS9XMAA4ppt.jpg,Buzzsprout release date screen with the name o...,photo
2086543,1334703124951470083,2020-12-04 03:36:37,I have 1 new followers from USA last week. See...,https://t.co/4XkkCnN0ID,http://pbs.twimg.com/media/EoXRa0RXIAIN9VU.jpg,User report,photo
1470656,1335014616301264896,2020-12-05 00:14:22,"For him or for her, there is something for you...",https://t.co/VJhpAjIVOl,http://pbs.twimg.com/media/EobrdAZXEAg_ulm.png,"We add new products everyday. Vibrators, dildo...",photo
1631405,1334999798060093440,2020-12-04 23:15:29,Its better to own the racecourse then the race...,https://t.co/sY086xb34B,http://pbs.twimg.com/media/EobfPdAXMAUtJsX.jpg,Something went wrong,photo
2152893,1334697552390701056,2020-12-04 03:14:28,sch! gunanya brainly apa lol https://t.co/KYdv...,https://t.co/KYdvfK0ahG,http://pbs.twimg.com/media/EoXMWYqVQAIkLoi.jpg,sch! gunanya brainly apa lol,photo
1427782,1335018726081638401,2020-12-05 00:30:42,I feel seen @hbomax https://t.co/6CRftbAfqF,https://t.co/6CRftbAfqF,http://pbs.twimg.com/media/EobwPtQXMAEobyv.jpg,"Die Hard, Die Hard2, Leathal Weapon",photo
602394,1334527753706594305,2020-12-03 15:59:45,Happy birthday mas ganteng~\n#MasGanteng \n#Wi...,https://t.co/1QjSGF8kG7,http://pbs.twimg.com/media/EoUx5xpVEAAUinz.jpg,💜💜 💜,photo


In [43]:
# pd.options.display.max_colwidth = 250
# df_alt_text.loc[[39657]]

# Preprocess alt text to use for modeling
Glancing through, it (unsurprisingly) looks like the user-created alt text in my corpus will need a fair bit of cleaning to be generalizable. These are the preprocessing steps I'll take:
1. remove pre-populated stock text like "discord image", "something went wrong", etc.
2. remove photos of text (I'm not trying to create an OCR model)
3. remove proper nouns
4. remove "photo of" or "picture of", since those could confuse the model and make for bad alt text anyways
5. remove any captions that are too short the above steps

## Remove records with junk alt text

In [44]:
# remove pre-populated, uninformative captions
# which of these appear frequently?
df_alt_text['alt_text'].value_counts().head(7)

User report                                                                                                                                                                                                                                                                                                                                                             262
Something went wrong                                                                                                                                                                                                                                                                                                                                                    248
Discord Image                                                                                                                                                                                                                                                                   

Interesting. It looks like most of these are junk, only one would be considered "good" alt text (brief, descriptive, natural language). Are they all the same image?

In [45]:
df_alt_text[df_alt_text['alt_text']=='Small flowers in a planter on a sunny balcony, blossoming.']

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type
182004,1334180798027718658,2020-12-02 17:01:04,"Congrats to $ALGO, the Biggest Green Dildo of ...",https://t.co/NEx3ZGT7fs,http://pbs.twimg.com/media/EoP2XSkW4AAlI7Y.png,"Small flowers in a planter on a sunny balcony,...",photo
205463,1333833309126860800,2020-12-01 18:00:16,"The Biggest Green Dildo of the Day: $SUSHI, +1...",https://t.co/vvaWqLUxtD,http://pbs.twimg.com/media/EoK6Uy-W4AI8aRj.png,"Small flowers in a planter on a sunny balcony,...",photo
317926,1333607122202869762,2020-12-01 03:01:29,"Congrats to $QNT, the Biggest Green Dildo of t...",https://t.co/umrX9p6dUD,http://pbs.twimg.com/media/EoHsm-YXMAAp7_h.png,"Small flowers in a planter on a sunny balcony,...",photo
361878,1334331809526673408,2020-12-03 03:01:08,"Congrats to $SUSHI, the Biggest Green Dildo of...",https://t.co/HuCaO3bKao,http://pbs.twimg.com/media/EoR_tV4W8AAJyqV.png,"Small flowers in a planter on a sunny balcony,...",photo
595889,1334528062520758272,2020-12-03 16:00:59,"Congrats to $AAVE, the Biggest Green Dildo of ...",https://t.co/RULWz9Y9A4,http://pbs.twimg.com/media/EoUyMxaXcAYKV4Z.png,"Small flowers in a planter on a sunny balcony,...",photo
847141,1334377106093051904,2020-12-03 06:01:08,"Congrats to $LUNA, the Biggest Green Dildo of ...",https://t.co/8hp5hJsbiw,http://pbs.twimg.com/media/EoSo58RW4AAs45r.png,"Small flowers in a planter on a sunny balcony,...",photo
999454,1334361981638365185,2020-12-03 05:01:02,"Congrats to $LUNA, the Biggest Green Dildo of ...",https://t.co/HIygT9Skbk,http://pbs.twimg.com/media/EoSbJhSXEAEMCRB.png,"Small flowers in a planter on a sunny balcony,...",photo
1077086,1334663933802770435,2020-12-04 01:00:53,"Congrats to $SC, the Biggest Green Dildo of th...",https://t.co/t66eg4BAsG,http://pbs.twimg.com/media/EoWtxfvXMAIiG3u.png,"Small flowers in a planter on a sunny balcony,...",photo
1349512,1335026328890986496,2020-12-05 01:00:55,"Congrats to $XEM, the Biggest Green Dildo of t...",https://t.co/dkYTU3OXYJ,http://pbs.twimg.com/media/Eob3XsEXcAURKgf.png,"Small flowers in a planter on a sunny balcony,...",photo
1506586,1335011288058130436,2020-12-05 00:01:09,"Congrats to $SC, the Biggest Green Dildo of th...",https://t.co/YG4DqP8aZf,http://pbs.twimg.com/media/EobpsLZXIAc21Ne.png,"Small flowers in a planter on a sunny balcony,...",photo


oooooohkay. Definitely not a good caption. Not sure why this image has such a mismatched alt text (these tweets look spammy), but hopefully this isn't too common in my dataset.

Based on this, I'm going to make the assumption that any caption that's appearing more than once in my dataset is likely junk, and just get rid of all of those.

In [46]:
vc = df_alt_text['alt_text'].value_counts()
recurrent_alt_texts = list(vc[vc > 1].index)
df_alt_text = df_alt_text[~df_alt_text['alt_text'].isin(recurrent_alt_texts)]
df_alt_text.shape

(4881, 7)

## Remove images of text
For V1, I'm going to identify/remove records based on keywords in the alt text that indicate that text is an important part of the image. 

A future version could use OCR, although I think I do want to keep images where text is a small part of the image (eg small caption), so it would ideally be an implementation like `if text area < 15% of total then keep`.

In [47]:
text_keywords = ['reads','says','text','screenshot','\n','screen shot', 'screencap','screen cap','labeled',
                 'username','tweet', 'twitter', 'article', 'receipt',':','-','png','jpg','jpeg','metadata',
                 '@','#','.com','success by','bytes','json', 'success from']


df_alt_text = df_alt_text[~df_alt_text['alt_text'].apply(lambda r: any([kw in r.lower() for kw in text_keywords]))]
df_alt_text.sample(10)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type
1574741,1335004955267248128,2020-12-04 23:35:59,notnsure how i keep doing this https://t.co/jk...,https://t.co/jk0grVdHEg,http://pbs.twimg.com/media/Eobj7klW8AgiEqH.jpg,cap of soa,photo
1519649,1335010158842417152,2020-12-04 23:56:39,@cesurarts AY PLEASE PLEASE MAKE MORE OF IT ht...,https://t.co/ApToxa2Y1B,http://pbs.twimg.com/media/EoboqcyXYAAqAgm.jpg,KAEYA SUPREMACY,photo
2116275,1334700548444934146,2020-12-04 03:26:22,The timeless elegance of premium cotton in whi...,https://t.co/Lof23iKumU,http://pbs.twimg.com/media/EoXO_JvUUAEDhyJ.jpg,white premium cotton katun warna putih sprei d...,photo
357013,1334332172509962241,2020-12-03 03:02:35,@fredbauerblog The Associated Press wire: http...,https://t.co/fbznIacymf,http://pbs.twimg.com/media/EoR_5gwUwAERMhi.jpg,search results for hr1044 from the AP,photo
997110,1334362205001814017,2020-12-03 05:01:55,"@dilfiplier www.etsy/shop/fabulousfidgets"" htt...",https://t.co/G3I2XczzFu,http://pbs.twimg.com/media/EoSbUyjW4AEtT_q.jpg,a photo of a chew necklace with a business car...,photo
725941,1334520424647057410,2020-12-03 15:30:38,wait imagine phil bought a beard wig like jenn...,https://t.co/HoKHz1zyJK,http://pbs.twimg.com/media/EoUrP6QXIAEFxqc.jpg,jenna marbles in a long beard wig with long ha...,photo
2403282,1333221307836579842,2020-11-30 01:28:24,Automated business tax lodgements from https:/...,https://t.co/uF0ifFPWgy,http://pbs.twimg.com/media/EoCNdnZUYAQGugm.jpg,Automated business tax lodgements,photo
124705,1334184415510065165,2020-12-02 17:15:27,The cozy set is so cozy and cute I love it\n#r...,https://t.co/c5u1iSOHnr,http://pbs.twimg.com/media/EoP5eoRWEAMXEOU.png,Wearing the Cozy set with my friend Zack,photo
421667,1334326481682640898,2020-12-03 02:39:58,They're #BringingBackBilly and now I can be ha...,https://t.co/Tlr0ShdvfI,http://pbs.twimg.com/media/EoR6zWnXEAATxM-.jpg,DePaul CPA Patch with Billy Blue Demon,photo
1487210,1335013060554190860,2020-12-05 00:08:11,Day 4 is GOOD. Gouda is gentle and the Mediter...,https://t.co/zvT2qL4ps3,http://pbs.twimg.com/media/EobrTXRXMAE08KN.jpg,A small square of Mediterranean gouda,photo


In [48]:
df_alt_text.shape

(3005, 7)

In [49]:
df['created_at'].max()

Timestamp('2020-12-05 01:53:43')

In [50]:
df_alt_text.sample(10)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type
727625,1334520331634139137,2020-12-03 15:30:15,The perfect warning button to wear this holida...,https://t.co/EuARlHrE0b,http://pbs.twimg.com/media/EoUrKpZXEAA2rJX.jpg,A pinback button with an illustration of a sna...,photo
410466,1334327489959657473,2020-12-03 02:43:58,This big goofus is doing his best to cheer me ...,https://t.co/ZTUBp9QDyt,http://pbs.twimg.com/media/EoR7x7CVQAAso_S.jpg,A flame point medium haired cat,photo
1776099,1333575419765944320,2020-12-01 00:55:31,y'all: https://t.co/hw6UuMmZ5H,https://t.co/hw6UuMmZ5H,http://pbs.twimg.com/media/EoHPxitW8AA3jce.jpg,"Image of Sue Sylvester edited to read ""I am go...",photo
1758168,1333879553270407170,2020-12-01 21:04:02,"God, I miss Monterey in the Spring time. \n\n#...",https://t.co/dQKnT5VFsO,http://pbs.twimg.com/media/EoLkYnUW4AQoPto.jpg,Monterey.,photo
1948438,1334715676548759552,2020-12-04 04:26:29,Moon Spooky 👀 https://t.co/N72JtnEo5k,https://t.co/N72JtnEo5k,http://pbs.twimg.com/media/EoXc1O3U8AIt-cm.jpg,Image of the moon with clouds over it,photo
2330682,1334262549324427264,2020-12-02 22:25:55,It’s beginning to look a lot like Christmas 🎶🎅...,https://t.co/Ci77JJOsAp,http://pbs.twimg.com/media/EoRAsyyXEAAuW5Y.jpg,Bedfont Green Dental Practice 2020 Christmas T...,photo
1888590,1334721366814126081,2020-12-04 04:49:06,Hi friend iam shrikant kulat \nPlease help me ...,https://t.co/4GKIyIrvNM,http://pbs.twimg.com/media/EoXh7TqVEAAK0oR.jpg,My you tub channel logo,photo
663966,1334524169313280000,2020-12-03 15:45:30,@justintrimble roadmap..\n\n#btcoffensive http...,https://t.co/xJrzr7Q5A5,http://pbs.twimg.com/media/EoUuVIgXUAYEkLt.jpg,"Alien Ether, 2016?",photo
1899716,1334720299649146880,2020-12-04 04:44:51,lotta talk lately about din wielding the darks...,https://t.co/I4Bnbpw1Ld,http://pbs.twimg.com/media/EoXhCIpXIAAHacK.jpg,sketch of the mandalorian holding the darksabe...,photo
746884,1334519173372928007,2020-12-03 15:25:39,"On this day in 1973, Pioneer 10 became the fir...",https://t.co/7B7QEwH1Br,http://pbs.twimg.com/media/EoUqHR3W8AQ691-.jpg,Artist rendering of Pioneer spacecraft flying ...,photo


## Lowercase and remove punctuation

In [51]:
import re
import string

In [52]:
# # preprocess text

# text = " ".join(caption for caption in df_alt_text['alt_text']).lower()
# # remove punctuation
# text = re.sub(r'[^\w]', ' ',text)
# # remove numbers
# text = re.sub('\w*\d\w*', ' ',text)

In [53]:
# text = " ".join(caption for caption in df_alt_text['alt_text']).lower()
# text = "".join(filter(str.isalpha(),text))
# text[:30]

In [54]:
# text = " ".join(caption for caption in df_alt_text['alt_text']).lower()
# text = re.sub(r'[^a-zA-Z]',' ',text)
# text[:500]

In [55]:
df_alt_text['alt_text_clean'] = df_alt_text['alt_text'].str.replace('[^a-zA-Z ]+', '').str.lower()
df_alt_text.sample(2)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type,alt_text_clean
2399976,1333221620630994944,2020-11-30 01:29:39,"Sadtember, Octrouble, Lovember, and then I Dre...",https://t.co/7Um4q5CvIK,http://pbs.twimg.com/media/EoCN-hZUYAARv0g.jpg,Lovember playlist,photo,lovember playlist
498137,1334533581113188353,2020-12-03 16:22:54,@bts_bighit niceeee theme😙💜 https://t.co/2TWTf...,https://t.co/2TWTfq1HGP,http://pbs.twimg.com/media/EoU3NgNU0AILyJU.jpg,i like the theme🤧💜,photo,i like the theme


In [56]:
words = nltk.corpus.words.words()
df_alt_text['alt_text_cleaner'] = [' '.join(y for y in x.split() if y in words) for x in df_alt_text['alt_text_clean']]
#df_alt_text['alt_text_cleaner'] = [word for word in df_alt_text['alt_text_clean'].str.split() if word in words]

In [57]:
df_alt_text.head()

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type,alt_text_clean,alt_text_cleaner
747,1333857393801891842,2020-12-01 19:35:59,"@Hallmark I just wanted to buy stuff, not be a...",https://t.co/H617bRVeCV,http://pbs.twimg.com/media/EoLQOiKXYAY52gk.jpg,I’ve never met a more hostile pinecone in my life,photo,ive never met a more hostile pinecone in my life,never met a more hostile in my life
1003,1333857368677859329,2020-12-01 19:35:53,"Happy for Elliot Page, and also relieved https...",https://t.co/RUF0l6aYTT,http://pbs.twimg.com/media/EoLNBYeW8AANYfE.png,"Under ""Trends for Portland, United States,"" ""E...",photo,under trends for portland united states elliot...,under for united is listed as no
2032,1333857294115889154,2020-12-01 19:35:35,I have been doing some color-coding lately. ht...,https://t.co/J0ZU1oCgMI,http://pbs.twimg.com/media/EoLQFU5XIAUta9w.jpg,An Atreus keyboard with multi color keys.,photo,an atreus keyboard with multi color keys,an keyboard with color
2117,1333857290345123844,2020-12-01 19:35:34,'Hats off' to 'mask on'! \n\n📸: lailucien (IG)...,https://t.co/dw9GxtNaxu,http://pbs.twimg.com/media/EoLQCVZXcAQbhkO.jpg,Woman with mask in subway station,photo,woman with mask in subway station,woman with mask in subway station
3013,1333857218282868737,2020-12-01 19:35:17,This is what I have to put up with. https://t....,https://t.co/rWBp9dtj4e,http://pbs.twimg.com/media/EoLP6pzWEAIAVDF.jpg,The cat has plunged himself into the Christmas...,photo,the cat has plunged himself into the christmas...,the cat himself into the tree he is definitely...


In [64]:
df_alt_text['alt_text_clean_len'] = df_alt_text['alt_text_clean'].str.split(" ").str.len()
df_alt_text['alt_text_clean_len'].describe()

count    3005.000000
mean       11.497504
std        13.734878
min         1.000000
25%         4.000000
50%         7.000000
75%        14.000000
max       180.000000
Name: alt_text_clean_len, dtype: float64

In [65]:
df_alt_text.to_clipboard()

In [66]:
# did a bunch of eyeballing in a spreadsheet, and captions with 6 or more words seemd to be a reasonable quality cutoff
df_alt_text = df_alt_text[df_alt_text['alt_text_clean_len'] > 5]
df_alt_text.shape

(1855, 12)

# Remove 'photo/picture of'

In [68]:
df_alt_text['alt_text_clean'] = df_alt_text['alt_text_clean'].str.replace('photo of ','')
df_alt_text['alt_text_clean'] = df_alt_text['alt_text_clean'].str.replace('picture of ','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_alt_text['alt_text_clean'] = df_alt_text['alt_text_clean'].str.replace('photo of ','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_alt_text['alt_text_clean'] = df_alt_text['alt_text_clean'].str.replace('picture of ','')


In [75]:
df_alt_text.sample(5)

Unnamed: 0,id,created_at,tweet_text,tweet_url,img_url,alt_text,media_type,alt_text_clean,alt_text_cleaner,alt_text_cleaner_len,alt_text_len,alt_text_clean_len
1210346,1334653010274889728,2020-12-04 00:17:28,"I got my computer reseted, and all my settings...",https://t.co/uQl3qYwmXZ,http://pbs.twimg.com/media/EoWjPcuXIAIaQ2W.png,"Drawing of my oc, Vanessa. She's just standing...",photo,drawing of my oc vanessa shes just standing there,drawing of my just standing there,6,6,9
1283734,1335032725749678080,2020-12-05 01:26:20,@DevilofAmaranth For your further consideratio...,https://t.co/8W4eCgSEaN,http://pbs.twimg.com/media/Eob9MAhVEAAJlwn.jpg,A moral alignment chart with “unstoppable forc...,photo,a moral alignment chart with unstoppable force...,a moral alignment chart with unstoppable force...,12,12,14
1721792,1333882429795409925,2020-12-01 21:15:28,Black Mirror gave us this gem of a moment: htt...,https://t.co/hHIRAgmMuz,http://pbs.twimg.com/media/EoLm53_XMAUTBC2.png,PSVR2 Exclusive Fucking a Polar Bear,photo,psvr exclusive fucking a polar bear,exclusive a polar bear,4,4,6
1938741,1334716596015816704,2020-12-04 04:30:08,"Dive into the sea, and wander through the fore...",https://t.co/qv4hBpQVfE,http://pbs.twimg.com/media/EoXdq4uXcAI09jg.jpg,A colorful illustration of the letter A covere...,photo,a colorful illustration of the letter a covere...,a colorful illustration of the letter a covere...,32,32,34
1733867,1333881455823495169,2020-12-01 21:11:36,Joe Ross and the Nats have agreed to a one-yea...,https://t.co/wNwxdnERqD,http://pbs.twimg.com/media/EoLl7GIWEAEUci9.jpg,Picture of Joe Ross pitching in a red National...,photo,joe ross pitching in a red nationals uniform,picture of joe ross pitching in a red uniform,9,9,10


## Remove emojis

In [353]:
#demoji.download_codes()

In [354]:
text = demoji.replace(text)

## Remove proper nouns

In [367]:
# identify all proper nouns
tagged_text = pos_tag(text.split())
propernouns = [word for word,pos in tagged_text if pos == 'NNP']
len(propernouns)

25

In [368]:
# check what the propernouns look like... nltk doesn't seem to be doing a great job identifying them
propernouns[:30]

['zell',
 'yoshi',
 'x',
 'knee',
 'xbox',
 'xmas',
 'december',
 'yoda',
 'december',
 'xiumin',
 'kamala',
 'xs',
 'x',
 'xmas',
 'x',
 'xd',
 'december',
 'xmas',
 'donald',
 'j',
 'december',
 'mile',
 'xmas',
 'kana',
 'lovember']

second way of trying this... keep only english words

In [369]:
words = set(nltk.corpus.words.words())
text = " ".join(word for word in nltk.wordpunct_tokenize(text))

In [370]:
text[:1000]

'i ve never met a more hostile pinecone in my life under trends for portland united states elliot is listed as no an atreus keyboard with multi color keys woman with mask in subway station the cat has plunged himself into the christmas tree he is definitely eating it christmas tree on car thining and wisely man working from home wearing a business shirt and tie on top pajama pants on bottom the death of nick carraways innocence as he turns much like i am today close up of mistletoe nestling in a tree small round opaque white berries white person with pink braids heart earrings and pink turtle neck smiling how to make world s best chai milk tea dhaka roadside tea photo of levi s black ripped skinny jeans off of amazon for hope american modeller getting famous after winning global modelling award shy construction sick of being dumb and dumber by myself of funky town dirty ass stinky kitty lingerie panties that are a d i k edu bust of a brunette woman smiling and holding a pen guildy s he

In [341]:
tagged_text = pos_tag(text.split())
propernouns = [word for word,pos in tagged_text if pos == 'NNP']

In [344]:
len(propernouns)

277

# Download Images

cutoff (max timestamp) for first image download: `Timestamp('2020-12-02 22:57:08')`

In [77]:
urls = list(df_alt_text['img_url'])

In [79]:
counter = 0
for i in urls:
#     try:
        filename = 'twitter_images/'+i[27:46]
        response = requests.get(i)
        file = open(filename, "wb")
        file.write(response.content)
        file.close()
        if counter % 20 == 0:
            print(f'{counter} images downloaded.')
        counter +=1
#     except:
#         pass

0 images downloaded.
20 images downloaded.
40 images downloaded.
60 images downloaded.
80 images downloaded.
100 images downloaded.
120 images downloaded.
140 images downloaded.
160 images downloaded.
180 images downloaded.
200 images downloaded.
220 images downloaded.
240 images downloaded.
260 images downloaded.
280 images downloaded.
300 images downloaded.
320 images downloaded.
340 images downloaded.
360 images downloaded.
380 images downloaded.
400 images downloaded.
420 images downloaded.
440 images downloaded.
460 images downloaded.
480 images downloaded.
500 images downloaded.
520 images downloaded.
540 images downloaded.
560 images downloaded.
580 images downloaded.
600 images downloaded.
620 images downloaded.
640 images downloaded.
660 images downloaded.
680 images downloaded.
700 images downloaded.
720 images downloaded.
740 images downloaded.
760 images downloaded.
780 images downloaded.
800 images downloaded.
820 images downloaded.
840 images downloaded.
860 images downlo