# Imports, API Object, Retrieving Tweets:

In [1]:
import os
import requests
import json 
from dotenv import load_dotenv
import time
load_dotenv()
import pandas as pd 
import csv
import tweepy
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import re
import numpy as np

In [2]:
twitter_api_key = os.getenv("TWITTER_API_KEY")
twitter_secret_key = os.getenv("TWITTER_SECRET_KEY")
twitter_access_token = os.getenv("TWITTER_ACCESS_TOKEN")
twitter_secret_token = os.getenv("TWITTER_SECRET_TOKEN")

In [3]:
def oAuth():
    try:
        auth = tweepy.OAuthHandler(twitter_api_key,twitter_secret_key)
        auth.set_access_token(twitter_access_token,twitter_secret_token)
        return auth
    except Exception as e:
        return None

oauth= oAuth()

tweepy_api = tweepy.API(oauth)

In [4]:
def get_new_tweets(names):
    print("Retrieving tweets")
    corpus = []                                                                                        
    for name in names:
        tweets = tweepy_api.user_timeline(screen_name = name, include_rts=False, count=5, tweet_mode="extended", exclude_replies = True)          
        time.sleep(4)
        corpus.extend(tweets)                                                                          
    data = [[tweet.id_str, tweet.user.screen_name, tweet.full_text, tweet.created_at] for tweet in corpus]
    tweets = pd.DataFrame(data, columns=['tweet_id', 'screen_name', 'text', 'timestamp'])                

    return tweets

In [5]:
# Change this to whoever you want
screen_names = ['CryptoKaleo']

In [6]:
user_tweets = get_new_tweets(screen_names)

Retrieving tweets


In [7]:
user_tweets.head()

Unnamed: 0,tweet_id,screen_name,text,timestamp
0,1655612006605504513,CryptoKaleo,Majority of CT followers came in 2021 / 2022 s...,2023-05-08 16:33:56+00:00
1,1655604074614538243,CryptoKaleo,I used to struggle conceptualizing how #Bitcoi...,2023-05-08 16:02:25+00:00
2,1655592953136525314,CryptoKaleo,I swear man every time I start getting back in...,2023-05-08 15:18:14+00:00
3,1655592717810900992,CryptoKaleo,GOOODDD MORNING CRYPTO TWITTER,2023-05-08 15:17:18+00:00
4,1654687332899909632,CryptoKaleo,Love you guys,2023-05-06 03:19:37+00:00


##  Data Exploration, Retrieving Replies, Creating a Dataframe

you can copy paste the screen_name and tweet_id from the dataframe above to search the replies of any tweet pulled



In [20]:

name = 'CryptoKaleo'
tweet_id = '1655612006605504513'

Result_type can be 'recent', 'popular', 'mixed':

* Mixed seems to be the best option to get the highest amount of replies from the query
* Not guaranteed to get 'hidden' replies which are often the bot replies

In [23]:
replies=[]
for tweet in tweepy.Cursor(tweepy_api.search_tweets,q='to:'+name, result_type = 'recent', tweet_mode = 'extended').items(100):
    if hasattr(tweet, 'in_reply_to_status_id_str'):
        if (tweet.in_reply_to_status_id_str==tweet_id):
            replies.append(tweet)

In [24]:
len(replies)

4

Text from replies:

In [25]:
for i in range(0,5):
    try:
        print(f"Tweet {i}: {replies[i].full_text}")
    except:
        pass

Tweet 0: @CryptoKaleo good..
Tweet 1: @CryptoKaleo You want followers who do not actively join Twitter?

A community is active and populated. 

Thinking about it, an account that follows you, no longer on Twitter is a false reading? Or you prefer the statistical record to add up and be counted. 

Perhaps they could purge the… https://t.co/57TuUUHTqc
Tweet 2: @CryptoKaleo This should be done randomly at least twice annually for any accounts inactive for 2 years
Tweet 3: @CryptoKaleo It’s almost time to get into $kevin just like when all NFT’s where dropping. All money flowed into @kevintoken so get ready to jump


Get different pieces of info like so:

In [26]:
print(f"Tweet Text: {replies[0].full_text}")
print(f"Users @: {replies[0].entities['user_mentions']}")
print(f"Account Created At: {replies[0].user.created_at}")
print(f"Followers: {replies[0].user.followers_count}")

Tweet Text: @CryptoKaleo good..
Users @: [{'screen_name': 'CryptoKaleo', 'name': 'K A L E O', 'id': 906234475604037637, 'id_str': '906234475604037637', 'indices': [0, 12]}]
Account Created At: 2019-09-30 20:15:46+00:00
Followers: 1397


In [27]:
text = []
screen_name = []
followers = []
following= [] #friends-count
account_age = []
verified = []
tweet_count = [] #statuses count
default_profile_image = []
user_mentions = []
linked_urls = []
reply_time = []
has_hashtag =[]

In [28]:
for i in range(len(replies)):
    if '#' in replies[i].full_text:
        has_hashtag.append('yes')
    else:
        has_hashtag.append('no') 

In [29]:
for i in range(len(replies)):
    text.append(replies[i].full_text)
    screen_name.append(replies[i].user.screen_name)
    followers.append(replies[i].user.followers_count)
    following.append(replies[i].user.friends_count)
    account_age.append(replies[i].user.created_at)
    verified.append(replies[i].user.verified)
    tweet_count.append(replies[i].user.statuses_count)
    default_profile_image.append(replies[i].user.default_profile_image)
    user_mentions.append(len(replies[i].entities['user_mentions'])) # Determines # of user mentions
    linked_urls.append(len(replies[i].entities['urls'])) # Determines if they linked a URL
    reply_time.append(replies[i].created_at) 

In [30]:
df = pd.DataFrame(
    {'text': text,
     'screen_name': screen_name,
     'followers':followers,
     'following':following,
     'account_age': account_age,
     'verified': verified,
     'tweet_count':tweet_count,
     'default_prof_img':default_profile_image,
     'user_mentions': user_mentions,
     'linked_urls' : linked_urls,
     'reply_time': reply_time,
     'has_hashtag' : has_hashtag
    })

In [31]:
df.head()

Unnamed: 0,text,screen_name,followers,following,account_age,verified,tweet_count,default_prof_img,user_mentions,linked_urls,reply_time,has_hashtag
0,@CryptoKaleo good..,hexorangutan,1397,286,2019-09-30 20:15:46+00:00,False,23273,False,1,0,2023-05-08 19:21:20+00:00,no
1,@CryptoKaleo You want followers who do not act...,MannyMVK,681,1404,2019-02-06 18:27:55+00:00,False,9574,False,1,1,2023-05-08 19:15:06+00:00,no
2,@CryptoKaleo This should be done randomly at l...,CryptoJeffS,1093,2072,2018-06-17 15:26:45+00:00,False,25928,False,1,0,2023-05-08 17:56:48+00:00,no
3,@CryptoKaleo It’s almost time to get into $kev...,CryptoPvZ,88,165,2022-05-12 15:11:52+00:00,False,2214,False,2,0,2023-05-08 17:41:41+00:00,no


## Data Cleanup, Adding Relevant Columns, Text Cleanup:


In [32]:
# Set time variable for when the original tweet was tweeted
tweet_time = user_tweets['timestamp'][0]

In [33]:
# Refers to original tweet creation time from earlier variable
df['tweet_time'] = tweet_time

In [34]:
# Calculates how long after the tweet was sent, the user replied to the tweet
df['time_to_respond_minutes'] = (user_tweets['timestamp'][0] - df['reply_time']).astype('timedelta64[m]')*-1

In [35]:
df.head()

Unnamed: 0,text,screen_name,followers,following,account_age,verified,tweet_count,default_prof_img,user_mentions,linked_urls,reply_time,has_hashtag,tweet_time,time_to_respond_minutes
0,@CryptoKaleo good..,hexorangutan,1397,286,2019-09-30 20:15:46+00:00,False,23273,False,1,0,2023-05-08 19:21:20+00:00,no,2023-05-08 16:33:56+00:00,168.0
1,@CryptoKaleo You want followers who do not act...,MannyMVK,681,1404,2019-02-06 18:27:55+00:00,False,9574,False,1,1,2023-05-08 19:15:06+00:00,no,2023-05-08 16:33:56+00:00,162.0
2,@CryptoKaleo This should be done randomly at l...,CryptoJeffS,1093,2072,2018-06-17 15:26:45+00:00,False,25928,False,1,0,2023-05-08 17:56:48+00:00,no,2023-05-08 16:33:56+00:00,83.0
3,@CryptoKaleo It’s almost time to get into $kev...,CryptoPvZ,88,165,2022-05-12 15:11:52+00:00,False,2214,False,2,0,2023-05-08 17:41:41+00:00,no,2023-05-08 16:33:56+00:00,68.0


In [36]:
df = df.drop(['tweet_time','reply_time'],axis= 1)

In [37]:
# Create a variable for todays date to calculate how old the account is
today = pd.Timestamp.now()
today = today.date()

In [38]:
#calculates the accounts age in days

df['account_age_days'] = today - df['account_age'][i].date()
for i in range(len(replies)):    
    df['account_age_days'][i] = today - df['account_age'][i].date()
    df['account_age_days'][i] = df['account_age_days'][i].days

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [39]:

df['account_age_days'] = df['account_age_days'].astype('float64')

In [40]:

df = df.drop(['account_age'],axis= 1)

Adding a column for follower to following ratio --> May be helpful

In [41]:

df['follwers_to_following_ratio'] = (df['followers']/df['following']).round(2)

Adding a column for average tweets per day

In [42]:
df['avg_tweets_per_day'] = (df['tweet_count']/df['account_age_days']).round(2)

Removes special characters, numbers, links, etc. from tweet text

In [43]:
def clean_tweets(text):
    text = text.lower()
    text = re.sub("@[A-Za-z0-9_]+","", text)
    text = re.sub("#[A-Za-z0-9_]+","", text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www.\S+", "", text)
    text = re.sub('[()!?]', ' ', text)
    text = re.sub('\[.*?\]',' ', text)
    text = re.sub("[^a-z0-9]"," ", text)
    return text

df['clean_text'] = df['text'].apply(clean_tweets)
df = df.drop('text',axis=1)

Important Note:

* IF 'user_mentions' > 1, then that means they mentioned someone else other than the original tweet poster in their reply. This is indicative of spam
* The default number is 1 here, because it's counting the reply '@original_tweeter'(reply text)
* IF 'linked_urls' > 0, that means they have linked something in their reply. Also indicative of a spam reply
* Expect lower time to respond to be indicative of spam, as many of these accounts are automated using the twitter API
* Strangely, it seems that people are buying older twitter accounts to spam reply with, as it makes it harder to detect as spam if the account
has been around for a while

In [44]:
df.head()

Unnamed: 0,screen_name,followers,following,verified,tweet_count,default_prof_img,user_mentions,linked_urls,has_hashtag,time_to_respond_minutes,account_age_days,follwers_to_following_ratio,avg_tweets_per_day,clean_text
0,hexorangutan,1397,286,False,23273,False,1,0,no,168.0,1316.0,4.88,17.68,good
1,MannyMVK,681,1404,False,9574,False,1,1,no,162.0,1552.0,0.49,6.17,you want followers who do not actively join t...
2,CryptoJeffS,1093,2072,False,25928,False,1,0,no,83.0,1786.0,0.53,14.52,this should be done randomly at least twice a...
3,CryptoPvZ,88,165,False,2214,False,2,0,no,68.0,361.0,0.53,6.13,it s almost time to get into kevin just like...


## Writing to an Excel file, so we can gather as many entries as possible, before making a ML model:

In [None]:
import pandas as pd

with pd.ExcelWriter('Twitter_data.xlsx', mode= 'a', engine='openpyxl', if_sheet_exists='new') as writer:
    df.to_excel(writer)

In [None]:
for i in range(len(replies)):
    print(f"Tweet {i}: {replies[i].full_text}")