# Analyzing Instagram trend data

## Our analysis subjects:

1. Which cities have the most profiles?
2. Which cities have the most posts?
3. Which profiles have the most posts?
4. Which profiles have the most likes?


## Checking the Python version

In [1]:
# Report the Python interpreter version this notebook is running on
from platform import python_version

interpreter_version = python_version()
print('Versao da Linguagem Python Usada neste Jupyter Notebook', interpreter_version)

Versao da Linguagem Python Usada neste Jupyter Notebook 3.9.1


## Splitting the posts file into smaller chunks


In [None]:
import os
import shutil
import csv

# Path of the large source file
file_path = r'E:\Kaggle\input\Instagram\instagram_posts.csv'

# Maximum size of the data rows in each chunk file, in bytes (header excluded)
chunk_size = 10000000


def split_csv_by_size(path, max_bytes, delimiter=','):
    """Split a CSV file into numbered chunk files of at most ``max_bytes`` each.

    Chunk files are written next to ``path`` as ``<stem>_<n>.csv`` (n starts at
    0). Every chunk starts with a copy of the header line, and each record is
    copied verbatim from the source text, so quoting and line endings are
    preserved exactly. ``csv.reader`` is used only to find record boundaries,
    which keeps quoted fields containing line breaks in one piece.

    Args:
        path: Path of the UTF-8 encoded source file.
        max_bytes: Upper bound for the UTF-8 size of the data rows in a chunk.
            A single record larger than this still gets a chunk of its own
            instead of being dropped.
        delimiter: Field delimiter used to recognise record boundaries.

    Returns:
        int: Number of chunk files written (0 when the source has no data rows).

    Raises:
        OSError: If the source cannot be opened or a chunk cannot be written.
        csv.Error: If the reader rejects the input (e.g. an oversized field).
    """
    base_name = os.path.splitext(path)[0]
    pending_lines = []  # physical lines the reader consumed for the current record

    def _recording(lines):
        # Hand lines to csv.reader while remembering their raw text.
        for line in lines:
            pending_lines.append(line)
            yield line

    def _take_record():
        # Return and reset the raw text of the record that was just parsed.
        text = ''.join(pending_lines)
        pending_lines.clear()
        return text

    chunks_written = 0
    current_size = 0
    chunk_file = None
    with open(path, 'r', newline='', encoding='utf-8') as source:
        reader = csv.reader(_recording(source), delimiter=delimiter)
        if next(reader, None) is None:
            # Empty source: there is no header, so nothing to write.
            return 0
        header_text = _take_record()
        try:
            for _ in reader:
                record_text = _take_record()
                record_size = len(record_text.encode('utf-8'))
                # Roll over to a new chunk BEFORE writing the record that would
                # overflow, so that record is carried over instead of lost.
                needs_new_chunk = chunk_file is None or (
                    current_size > 0 and current_size + record_size > max_bytes
                )
                if needs_new_chunk:
                    if chunk_file is not None:
                        chunk_file.close()
                    chunk_name = f'{base_name}_{chunks_written}.csv'
                    chunk_file = open(chunk_name, 'w', newline='', encoding='utf-8')
                    chunk_file.write(header_text)
                    chunks_written += 1
                    current_size = 0
                chunk_file.write(record_text)
                current_size += record_size
        finally:
            if chunk_file is not None:
                chunk_file.close()
    return chunks_written


# NOTE(review): the loading cell reads the chunk files with delimiter '\t'. If
# the source is tab-separated, pass delimiter='\t' here so quoted multi-line
# fields are recognised as single records - confirm against the data.
chunk_num = split_csv_by_size(file_path, chunk_size)

## Loading the dataframes and defining the schemas

In [2]:
# Importing pandas and numpy
import pandas as pd
import numpy  as np

def location_schema():
    """Build the empty, typed template used to cast the locations dataframe.

    Returns:
        pandas.DataFrame: A frame with zero rows in which "sid" and "id" use
        the nullable "Int64" dtype and every other column uses "string".
    """
    integer_columns = ["sid", "id"]
    text_columns = [
        "name",
        "street",
        "zip",
        "city",
        "region",
        "cd",
        "phone",
        "aj_exact_city_match",
        "aj_exact_country_match",
        "blurb",
        "dir_city_id",
        "dir_city_name",
        "dir_city_slug",
        "dir_country_id",
        "dir_country_name",
        "lat",
        "lng",
        "primary_alias_on_fb",
        "slug",
        "website",
        "cts",
    ]
    # Insertion order of the mapping defines the column order of the frame.
    column_dtypes = {column: "Int64" for column in integer_columns}
    column_dtypes.update({column: "string" for column in text_columns})
    return pd.DataFrame(
        {column: pd.Series(dtype=dtype) for column, dtype in column_dtypes.items()}
    )

def profile_schema():
    """Build the empty, typed template used to cast the profiles dataframe.

    Returns:
        pandas.DataFrame: A frame with zero rows in which "sid" and
        "profile_id" use the nullable "Int64" dtype and every other column
        uses "string".
    """
    # (column name, dtype) pairs in the order the columns must appear.
    layout = [
        ("sid", "Int64"),
        ("profile_id", "Int64"),
        ("profile_name", "string"),
        ("firstname_lastname", "string"),
        ("description", "string"),
        ("following", "string"),
        ("followers", "string"),
        ("n_posts", "string"),
        ("url", "string"),
        ("cts", "string"),
        ("is_business_account", "string"),
    ]
    typed_columns = {}
    for column, dtype in layout:
        typed_columns[column] = pd.Series(dtype=dtype)
    return pd.DataFrame(typed_columns)

def post_schema():
    """Build the empty, typed template used to cast the posts dataframe.

    Returns:
        pandas.DataFrame: A frame with zero rows in which "sid" uses the
        nullable "Int64" dtype and every other column uses "string".
    """
    text_columns = (
        "sid_profile",
        "post_id",
        "profile_id",
        "location_id",
        "cts",
        "post_type",
        "description",
        "numbr_likes",
        "number_comments",
    )
    # "sid" comes first, followed by the text columns in declaration order.
    typed_columns = {"sid": pd.Series(dtype="Int64")}
    typed_columns.update((column, pd.Series(dtype="string")) for column in text_columns)
    return pd.DataFrame(typed_columns)

In [7]:
# Read the CSV files into dataframes
profile_file = r'E:\Kaggle\input\Instagram\instagram_profiles.csv'
cities_file  = r'E:\Kaggle\input\Instagram\instagram_locations.csv'
posts_file   = r'E:\Kaggle\input\Instagram\instagram_posts_1.csv'
ec = 'latin'
delimiter = '\t'
# The posts chunk is produced by the partitioning cell, which writes UTF-8, so
# it is decoded as UTF-8 rather than `ec` to keep non-ASCII text intact.
posts_encoding = 'utf-8'
# NOTE(review): only chunk `_1` is loaded; the other chunk files are not read
# here - confirm this sampling is intended.

# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0. Its
# replacement `on_bad_lines='warn'` keeps the same behaviour (skip the
# malformed line and emit a warning), so pick whichever this pandas supports.
pandas_release = tuple(int(part) for part in pd.__version__.split('.')[:2])
if pandas_release >= (1, 3):
    bad_lines_option = {'on_bad_lines': 'warn'}
else:
    bad_lines_option = {'error_bad_lines': False}

df_cities = pd.read_csv(cities_file, delimiter=delimiter, encoding=ec)

df_profiles = pd.read_csv(profile_file, delimiter=delimiter, encoding=ec)

# Build the schema once instead of once per chunk.
posts_dtypes = post_schema().dtypes

# Collect the typed chunks and concatenate a single time: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
post_chunks = []
for chunk in pd.read_csv(
    posts_file,
    delimiter=delimiter,
    encoding=posts_encoding,
    names=list(posts_dtypes.index),  # column names come from the schema
    header=0,                        # replace the header row of the file
    quotechar='"',
    chunksize=1000000,               # read the file piece by piece
    **bad_lines_option,
):
    # Apply the schema to each piece of the file
    post_chunks.append(chunk.astype(posts_dtypes))

# A file with a header but no rows yields no chunks; fall back to the empty
# typed frame so the expected columns still exist.
df_posts = pd.concat(post_chunks) if post_chunks else post_schema()




## Applying the schemas to the dataframes


In [4]:
# Cast every dataframe to the dtypes declared by its schema function
cities_dtypes = location_schema().dtypes
profiles_dtypes = profile_schema().dtypes
posts_dtypes = post_schema().dtypes

df_cities = df_cities.astype(cities_dtypes)
df_profiles = df_profiles.astype(profiles_dtypes)
df_posts = df_posts.astype(posts_dtypes)

AttributeError: 'TextFileReader' object has no attribute 'astype'

## The most profiles per city

In [None]:
# Count the non-null `id` values per city and list the largest counts first.
# NOTE(review): df_cities is loaded from the locations file, so this counts
# location rows per city; confirm that this is the intended meaning of
# "profiles per city".
city_group = (
    df_cities
    .groupby('city')
    .agg({'id': 'count'})
    .rename(columns={'id': 'QTD_PROFILES'})
    # After groupby, `city` is the index rather than a column, so it has to be
    # renamed with rename_axis; rename(columns=...) silently ignores it.
    .rename_axis('CITY')
    .sort_values(by='QTD_PROFILES', ascending=False)
)

city_group.head()

## The most posts per city

In [None]:
# Drop rows whose `sid` is not numeric. errors='coerce' makes to_numeric turn
# values that cannot be parsed as numbers into NaN, which dropna then removes.
df_posts['sid'] = pd.to_numeric(df_posts['sid'], errors='coerce')
df_posts = df_posts.dropna(subset=['sid'])

# Join posts to locations through the location key.
# NOTE(review): `sid` looks like each table's own row id (posts reference
# profiles through a separate `sid_profile` column), so it is not used as the
# join key; posts are matched on `location_id` == locations `id` instead -
# confirm against the dataset documentation.
# Both keys are coerced to the nullable Int64 dtype so they compare as numbers.
posts_located = df_posts.assign(
    location_id=pd.to_numeric(df_posts['location_id'], errors='coerce')
).dropna(subset=['location_id'])  # posts without a usable location cannot be joined
posts_located = posts_located.astype({'location_id': 'Int64'})

locations_keyed = df_cities.assign(
    id=pd.to_numeric(df_cities['id'], errors='coerce').astype('Int64')
)

merged_df = pd.merge(
    locations_keyed,
    posts_located,
    left_on='id',
    right_on='location_id',
    how='inner',
    # `sid` and `cts` exist in both frames; label which side each came from.
    suffixes=('_location', '_post'),
)
merged_df.head()


#posts_group = df_posts.groupby('sid').agg({'post_id':'count'})
#posts_group= posts_group.sort_values(by='post_id',ascending=True).rename(columns={'post_id':'QTD_POSTS'})

