In [53]:
import pandas as pd
import os
import sys
import pymongo
import requests
import translators as ts
from langdetect import detect
from json_parsers import *
from sqlalchemy import create_engine
from pymongo import MongoClient


In [30]:
MONGO_URI = os.getenv("MONGO_URI")

client_db =  'stream'
client_col = 'frenchgp' #args.collection

# source and target connections
client = MongoClient(MONGO_URI)

# batch size
batch_size = 1000

# database and collection names
db = client[client_db]
col = db[client_col]

# IMPORT

In [110]:
df_in = pd.DataFrame(list(col.find({})))
df_in = df_in.drop(['matching_rules'],1)

# DEFINE FUNCTIONS

In [99]:
def getData(df):
    """
    Convert the nexted json inside the data column to its own dataframe
    """
    df['author_id'] = df.apply(lambda df: parse_json(df, 'data', 'author_id'), 1)
    df['created_at'] = df.apply(lambda df: parse_json(df, 'data', 'created_at'), 1)
    df['geo'] = df.apply(lambda df: parse_json(df, 'data', 'geo'), 1)
    df['tweet_id'] = df.apply(lambda df: parse_json_exact(df, 'data', 'id'), 1)
    df['raw_text'] = df.apply(lambda df: parse_json(df, 'data', 'text'), 1)
    df = df[['tweet_id','author_id','created_at','raw_text']]
    return df


 def getTranslation(df):
    clean_text = df['clean_text']
    translated_text = ts.google(clean_text, if_use_cn_host=True)
    return translated_text


def getCleanText(df):
    clean_text = ''.join(e for e in df['raw_text'] if e.isascii())
    clean_text = ''.join(e for e in clean_text if e not in ["!", "@", "#"])
    return clean_text


def getLanguage(df):
    clean_text = ''.join(e for e in df['raw_text'] if e.isascii())
    clean_text = ''.join(e for e in clean_text if e not in ["!", "@", "#"])
    language = detect(clean_text)
    return language   


def getUsers(df):
    users = df['includes']['users']
    return users


def getUserDataframe(df):
    df['user_created_at'] = df.apply(lambda df: parse_json(df, 'users', 'created_at'), 1)
    df['user_id'] = df.apply(lambda df: parse_json(df, 'users', 'id'), 1)
    df['location'] = df.apply(lambda df: parse_json(df, 'users', 'location'), 1)
    df['name'] = df.apply(lambda df: parse_json_exact(df, 'users', 'name'), 1)
    df['username'] = df.apply(lambda df: parse_json(df, 'users', 'username'), 1)
    df = df.drop(['users'],1)
    return df

Unnamed: 0,tweet_id,author_id,created_at,raw_text
0,1405185477834051584,1364879051438002176,2021-06-16T15:28:19.000Z,Esteban Ocon signs new Alpine F1 deal until 20...
1,1405185576085733380,1350309702,2021-06-16T15:28:43.000Z,BIG UP TO ESTEBAN OCON WHOSE SIGNED A NEW 3 YE...
2,1405185596771995653,1368546012067995650,2021-06-16T15:28:48.000Z,Neuer Dreijahresvertrag: Esteban Ocon verlänge...
3,1405185683753422851,4843529163,2021-06-16T15:29:08.000Z,"#Formula1 , Esteban #Ocon rinnova con #Alpine ..."
4,1405185708323741697,493042610,2021-06-16T15:29:14.000Z,"#F1: ufficiale, #Ocon pilota #Alpine fino al 2..."
5,1405185726921252866,831236353585246210,2021-06-16T15:29:19.000Z,Alpine heeft de line-up voor volgend jaar rond...
6,1405185742696075265,1357714434265915393,2021-06-16T15:29:22.000Z,French driver signs contract extension with a ...
7,1405185757149605888,1129878879143911426,2021-06-16T15:29:26.000Z,OFICIAL: Esteban Ocon seguirá con @AlpineF1Tea...
8,1405185763189301248,1318271660307472385,2021-06-16T15:29:27.000Z,Esteban Ocon has signed a 3-year deal with Alp...
9,1405185780444712960,3308385844,2021-06-16T15:29:31.000Z,Will Esteban Ocon ever be good enough to fight...


# CLEAN TWEET DATA / TRANSLATE

In [121]:
%%time
data_df = getData(df_in)
data_df['clean_text'] = data_df.apply(lambda data_df: getCleanText(data_df), 1)
data_df['language'] = data_df.apply(lambda data_df: getLanguage(data_df), 1)
data_df_translate = data_df[data_df['language'] != 'en'][['tweet_id','clean_text']]
data_df_translate['translated_text'] = data_df_translate.apply(lambda data_df_translate: getTranslation(data_df_translate), 1)
data_df = data_df \
            .merge(data_df_translate, on='tweet_id', how='left') \
            .rename(index=str, columns={'clean_text_x': 'clean_text'}) \
            .drop('clean_text_y', 1)
data_df['translated_text'] = data_df['translated_text'].fillna(data_df['clean_text'])
data_df = data_df[['tweet_id','author_id','created_at','language','raw_text','clean_text','translated_text']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
CPU times: user 253 ms, sys: 17.3 ms, total: 271 ms
Wall time: 7.75 s


# SEND TO POSTGRES

In [122]:
engine = create_engine('postgresql://postgres@localhost:5432/frenchgp')
data_df.to_sql('tweet', engine)

# USERS MENTIONED

In [124]:
df_in['users'] = df_in.apply(lambda df_in: getUsers(df_in), 1)

In [125]:
users_exploded = df_in[['tweet_id','users']].explode('users')
users_df = getUserDataframe(users_exploded)

In [130]:
def escapeArray(df, column):
    if len(df[column]) == 0:
        return ''
    else:
        return df[column]

users_df['location'] = users_df.apply(lambda users_df: escapeArray(users_df, 'location'), 1)
users_df

Unnamed: 0,tweet_id,created_at,user_id,location,name,username
0,1405185477834051584,2021-02-25T10:05:11.000Z,1364879051438002176,India,Khel ख़बर,Khel17934254
1,1405185576085733380,2013-04-13T21:55:37.000Z,1350309702,"Stoke Ash, England",Steven Featley,SFeatley
2,1405185596771995653,2021-03-07T12:56:52.000Z,1368546012067995650,"Unterföhring, Deutschland",Sky Sport Formel 1,skysportformel1
3,1405185683753422851,2016-01-24T22:43:30.000Z,4843529163,Italia,Sportface,sportface2016
4,1405185708323741697,2012-02-15T11:38:09.000Z,493042610,,FormulaPassion.it,FormulaPassion
5,1405185726921252866,2017-02-13T20:19:25.000Z,831236353585246210,,Ronald Vording,RonaldVording
6,1405185742696075265,2021-02-05T15:35:34.000Z,1357714434265915393,,F1 Ramble,F1Ramble
7,1405185757149605888,2019-05-18T22:38:11.000Z,1129878879143911426,"Bogotá, D.C., Colombia",Valentina Peña Orozco,valentinapena98
7,1405185757149605888,2009-04-02T07:08:31.000Z,28297965,,Alpine F1 Team,AlpineF1Team
8,1405185763189301248,2020-10-19T19:23:56.000Z,1318271660307472385,Worldwide,ASN Motorsports,AsnMotorsports


# SEND TO POSTGRES

In [None]:
users_df.to_sql('users_mentioned', engine)