In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Load the semicolon-delimited input table into a DataFrame
df = pd.read_csv('pictures_input.csv', delimiter=';')

# Show the column labels of the loaded frame
df.columns

Index(['Unnamed: 0', 'alias', 'url', 'multipleImage', 'weekday',
       'numberLikesCategory', 'amount_relevant_tags', 'moving_avg',
       'descriptionProcessed', 'numberPosts',
       ...
       'year', 'years', 'yes', 'yesterday', 'yet', 'yoga', 'you', 'young',
       'youre', 'youtube'],
      dtype='object', length=513)

In [3]:
# Tally how many rows carry each value of the 'multipleImage' flag
df.loc[:, 'multipleImage'].value_counts()

False    6125
True      203
Name: multipleImage, dtype: int64

In [4]:
# Keep only the rows whose 'multipleImage' flag is False.
# The explicit .copy() makes `df` an independent frame instead of a slice of
# the frame it was filtered from, so the per-row `df.at[...] = ...` writes
# performed later in the notebook are unambiguous and do not raise
# SettingWithCopyWarning.
df = df[df['multipleImage'] == False].copy()
df.shape

(6125, 513)

In [13]:

# Seconds to wait on any single HTTP request, so that one stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Create the 'pictures' folder if it doesn't exist
os.makedirs('pictures', exist_ok=True)

# Number of rows already visited per alias. A row's image number is its
# position among the rows sharing its alias (rows that fail still consume a
# number). This equals "count of same-alias rows with a smaller index" as long
# as the frame is iterated in ascending index order, and avoids rescanning the
# whole DataFrame for every row.
rows_seen = {}

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    url = row['url']
    alias = row['alias']
    folder_name = f'pictures/{alias}'

    image_number = rows_seen.get(alias, 0)
    rows_seen[alias] = image_number + 1

    # Create the subfolder if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    try:
        # Send a GET request to the Instagram post URL
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()

        # Find the image element in the HTML
        soup = BeautifulSoup(page.text, 'html.parser')
        image_element = soup.find('meta', property='og:image')
        image_url = image_element.get('content') if image_element else None
        if not image_url:
            df.at[index, 'error'] = True  # Set 'error' column to True for the corresponding row
            print(f"No image found for alias {alias}")
            continue

        # Download the image; a non-2xx answer is treated as a failure so an
        # HTML error page is never written to disk as a .jpg.
        image = requests.get(image_url, timeout=REQUEST_TIMEOUT)
        image.raise_for_status()
    except requests.RequestException as exc:
        # Network errors, timeouts, bad URLs and HTTP error statuses mark the
        # row as failed instead of aborting the remaining downloads.
        df.at[index, 'error'] = True
        print(f"Request failed for alias {alias}: {exc}")
        continue

    filename = f'{image_number}.jpg'
    file_path = os.path.join(folder_name, filename)
    with open(file_path, 'wb') as file:
        file.write(image.content)

    print(f"Downloaded {filename} for alias {alias}")
    df.at[index, 'error'] = False
    df.at[index, 'path'] = file_path.replace('\\', '/')  # Store the file path in the 'path' column


No image found for alias elisabeth.rioux
No image found for alias elisabeth.rioux
No image found for alias elisabeth.rioux
Downloaded 3.jpg for alias elisabeth.rioux
Downloaded 4.jpg for alias elisabeth.rioux
No image found for alias elisabeth.rioux
Downloaded 6.jpg for alias elisabeth.rioux
Downloaded 7.jpg for alias elisabeth.rioux
No image found for alias elisabeth.rioux
Downloaded 9.jpg for alias elisabeth.rioux
No image found for alias elisabeth.rioux
Downloaded 11.jpg for alias elisabeth.rioux
Downloaded 0.jpg for alias _picolo
Downloaded 1.jpg for alias _picolo
Downloaded 2.jpg for alias _picolo
Downloaded 3.jpg for alias _picolo
Downloaded 4.jpg for alias _picolo
Downloaded 5.jpg for alias _picolo
No image found for alias _picolo
Downloaded 7.jpg for alias _picolo
No image found for alias _picolo
No image found for alias _picolo
Downloaded 0.jpg for alias cacatengker
Downloaded 1.jpg for alias cacatengker
Downloaded 2.jpg for alias cacatengker
Downloaded 3.jpg for alias cacaten

In [14]:
# Save the full DataFrame (including the 'error' and 'path' columns written by
# the download loop) to 'df.csv', without writing the row index.
# NOTE(review): no `sep` is passed here, so pandas' default delimiter is used,
# whereas the input file is read with sep=';' — confirm this is intended.
df.to_csv('df.csv', index=False)

In [15]:
# A row whose 'error' value is still missing was never given an outcome by the
# download loop (e.g. the loop was interrupted), so it has no image path.
# Count such rows as errors rather than successes; otherwise they would pass
# the later `error == False` filter without a downloaded picture.
# The cast gives the column a plain boolean dtype.
df['error'] = df['error'].fillna(True).astype(bool)
df['error'].value_counts()

False    4849
True     1276
Name: error, dtype: int64

In [16]:
# Show the download outcome and stored file path per row. Leaving the frame as
# the cell's last expression uses the notebook's rich table display instead of
# the plain-text dump produced by print().
df[['error', 'path']]

      error                            path
0      True                             NaN
1      True                             NaN
2      True                             NaN
3     False  pictures/elisabeth.rioux/3.jpg
4     False  pictures/elisabeth.rioux/4.jpg
...     ...                             ...
6323   True                             NaN
6324   True                             NaN
6325   True                             NaN
6326   True                             NaN
6327   True                             NaN

[6125 rows x 2 columns]


In [17]:
# Keep only the rows whose 'error' flag is False
filtered_df = df.loc[df['error'] == False]


In [22]:
# Preview the description, post URL and stored image path of the kept rows
filtered_df.loc[:, ['descriptionProcessed', 'url', 'path']]

Unnamed: 0,descriptionProcessed,url,path
3,Missing much island think beautiful Caribbean ...,https://www.instagram.com/p/BTSIGXaDcZ7/?taken...,pictures/elisabeth.rioux/3.jpg
4,Small nose small cheeks big lips big eyes flee...,https://www.instagram.com/p/BTChWhoDkH1/?taken...,pictures/elisabeth.rioux/4.jpg
6,Current situation nothing picture landed Can...,https://www.instagram.com/p/BS9eOQnDdwY/?taken...,pictures/elisabeth.rioux/6.jpg
7,Mood,https://www.instagram.com/p/BSjdOWHDd1t/?taken...,pictures/elisabeth.rioux/7.jpg
9,would mention k k Thank k really expectin...,https://www.instagram.com/p/BTFKUt1jqgx/?taken...,pictures/elisabeth.rioux/9.jpg
...,...,...,...
6316,right fresh ham egg cheese tacos fresh homemad...,https://www.instagram.com/p/BSvuwZPAp8E/?taken...,pictures/plriley/8.jpg
6317,Guys summer favorite cant wait till,https://www.instagram.com/p/BTRoGBKgVa8/?taken...,pictures/plriley/9.jpg
6318,Local market Santiago Ixcuintla Nayarit Mexico,https://www.instagram.com/p/BTBv3A9AoBu/?taken...,pictures/plriley/10.jpg
6319,snow sleeping snow winter,https://www.instagram.com/p/BR0jOu3Ac5k/?taken...,pictures/drmusatokmak/0.jpg


In [26]:
# Drop the bookkeeping columns before exporting. errors='ignore' keeps this
# cell safe to re-run: `filtered_df` is reassigned to the result, so on a
# second run the columns are already gone and a plain drop would raise
# KeyError.
filtered_df = filtered_df.drop(columns=['error', 'multipleImage'], errors='ignore')

# Write the final table as a semicolon-delimited CSV without the row index.
filtered_df.to_csv('final_data_pictures.csv', sep=';', index=False)

In [27]:
# Display the (rows, columns) shape of the exported frame as a sanity check.
filtered_df.shape

(4849, 513)