## Music Genre Sentiment Analysis

Julie Bazalewski

Step 1: Import Modules

In [1]:
import tweepy
import pandas as pd
#https://textblob.readthedocs.io/en/dev/api_reference.html#textblob.blob.TextBlob.sentiment
from textblob import TextBlob
#https://pypi.org/project/demoji/
import demoji
import re
import string
import numpy as np

Step 2: Set up Twitter Credentials

In [2]:
%run ~/twitter_credentials.py

In [3]:
#Use tweepy.OAuthHandler to create an authentication using the given key and secret.
#con_key / con_secret are not defined in this notebook -- presumably they are set by the
#%run of twitter_credentials.py in the previous cell (TODO confirm)
auth = tweepy.OAuthHandler(consumer_key=con_key, consumer_secret=con_secret)
#Attach the access token and secret (acc_token / acc_secret, presumably from the same script)
auth.set_access_token(acc_token, acc_secret)

#Connect to the Twitter API using the authentication; get_tweets() reads this global `api` object
api = tweepy.API(auth)

Step 3: Define Functions

In [4]:
def get_tweets(search_list,num_needed):
    
    """
    Obtains at least the specified number of tweets from Twitter for given search terms
    
    Results are requested 100 at a time through the global ``api`` object, each
    request asking only for tweets older than the oldest one already collected.
    Whole pages are kept, so a term's list can hold more than ``num_needed`` tweets.
    If the API raises an error or returns no more results, a message is printed
    and whatever was collected so far for that term is kept.
    
    Parameters
    ----------
    search_list: list
    List of search terms
    
    num_needed: int
    Minimum number of desired tweets for each search term
    
    Returns
    -------
    full_tweet_list: list
    A nested list containing tweet data for each search term,
    in the same order as search_list
    """

    full_tweet_list = []

    for search_term in search_list:  #loop over the search terms

        tweet_list = []   #tweets collected for this search term
        last_id = None    #id of the oldest tweet seen so far (None until the first page arrives)
        print('running: {}'.format(search_term)) #print search iteration

        while len(tweet_list) < num_needed:   #while number of tweets is below desired amount:
            search_args = dict(q=search_term, lang='en', tweet_mode='extended', count=100)
            if last_id is not None:
                #only restrict by id once a real id is known; the first request
                #must not send a made-up (negative) max_id
                search_args['max_id'] = str(last_id - 1)

            try:                              #try to obtain 100 more tweets
                new_tweets = api.search(**search_args)
            except tweepy.TweepError as e:    #print error if reach limit
                print("Error", e)
                break

            if not new_tweets: #print message if not enough tweets can be obtained
                print("Could not find any more tweets!")
                break

            tweet_list.extend(new_tweets) #add new tweets from this iteration to list
            last_id = new_tweets[-1].id   #the next request continues below this id

        full_tweet_list.append(tweet_list)  #append all tweets for current search term to new position in full list

    return full_tweet_list

In [5]:
def tweet_dict_pop(full_tweet_list,search_terms):
    
    """
    Populates dictionary with relevant tweet data
    
    Every search term gets the keys 'text', 'user', 'location' (lists with one
    entry per tweet) and 'genre' (the search term name). The list keys are
    created even when a term has no tweets, so each nested dictionary always
    has the same shape.
    
    Parameters
    ----------
    full_tweet_list: list
    Nested list of tweet data for each search term; position i holds
    the tweets belonging to search_terms[i]
    
    search_terms: list
    List of search term names
    
    Returns
    -------
    tweet_dict: dictionary
    A nested dictionary containing text, username, and 
    location data for each tweet for each search term
    """
    
    tweet_dict = {}
    
    for i, search_term in enumerate(search_terms): #for each search term:
        
        #setdefault keeps existing data if the same search term appears twice
        term_data = tweet_dict.setdefault(search_term, {})
        for key in ('text', 'user', 'location'):
            term_data.setdefault(key, [])   #initialize key even if there are no tweets
        
        for tweet in full_tweet_list[i]:  #for each tweet:
            term_data['text'].append(tweet.full_text)         #append tweet text to key 'text'
            term_data['user'].append(tweet.user.screen_name)  #append username to key 'user'
            term_data['location'].append(tweet.user.location) #append location to key 'location'
            
        term_data['genre'] = search_term #add genre key to each nested dict
        
    return tweet_dict

In [6]:
def dict_to_df(tweet_dict,search_terms):
    
    """
    Creates Pandas data frame from nested dictionary
    
    One frame is built per search term and the frames are stacked in the
    order given by search_terms. The result has a fresh 0..n-1 index.
    
    Parameters
    ----------
    tweet_dict: dictionary
    Nested dictionary of tweet data
    
    search_terms: list
    List of search term names (keys of tweet_dict)
    
    Returns
    -------
    tweet_df: data frame
    Data Frame converted from input dictionary
    """
    
    frames = [pd.DataFrame(tweet_dict[search_term]) for search_term in search_terms]
    
    #pd.concat refuses an empty list, so return an empty frame when there are no search terms
    if not frames:
        return pd.DataFrame()
    
    #a single concat replaces repeated DataFrame.append calls (removed in pandas 2.0);
    #ignore_index renumbers the rows instead of keeping each term's own 0..k index
    tweet_df = pd.concat(frames, ignore_index=True)
    
    return tweet_df

In [7]:
#Translation table that deletes every ASCII punctuation character plus a few extra
#symbols; built once instead of rebuilding the punctuation list for each character
_PUNCT_TABLE = str.maketrans('', '', string.punctuation + "‘’”“…←♬")


def replace_punct(s):
    
    """
    Removes punctuation from a string
    
    Deletes all characters in string.punctuation as well as
    the curly quotes, ellipsis, arrow, and music-note symbols.
    Every other character is kept in its original order.
    
    Parameters
    ----------
    s: string
    String to be cleaned
    
    Returns
    -------
    s: string
    Cleaned string
    """
    
    return s.translate(_PUNCT_TABLE)

In [8]:
def clean_tweets(s):
    
    """
    Cleans a given string. Removes URLs, @mentions, emojis, and punctuation,
    then collapses every run of whitespace (spaces, tabs, EOL characters)
    into a single space
    
    Parameters
    ----------
    s: string
    String to be cleaned
    
    Returns
    -------
    s: string
    Cleaned string
    """
        
    s = re.sub(r'http\S+', ' ', s)  #remove urls
    s = re.sub(r'@[A-Za-z0-9_]+', ' ', s) #remove mentions, usernames can contain letters, numbers, and underscore
    s = demoji.replace(s)  #remove emojis
    s = replace_punct(s) #remove punctuation
    #collapse whitespace last: the removals above leave gaps behind, and a single
    #str.replace('  ',' ') pass would still leave doubles where three or more spaces met
    s = re.sub(r'\s+', ' ', s)
    return s

In [9]:
def score_sentiment(s):
    
    """
    Determines the sentiment score of a given string
    
    The score is the polarity reported by TextBlob, documented
    as ranging from -1 (most negative) to 1 (most positive).
    
    Parameters
    ----------
    s: string
    String to be analyzed
    
    Returns
    -------
    score: float
    Polarity score for input string
    """
    
    blob = TextBlob(s)    #wrap the text so TextBlob can analyze it
    return blob.polarity  #polarity component of the sentiment

In [10]:
def _allocate_quotas(print_count, tweet_counts):
    #Decide how many rows to take from each search term's group (tweet_counts must be non-empty)
    base, extra = divmod(print_count, len(tweet_counts))
    wanted = [base] * len(tweet_counts)
    wanted[-1] += extra   #the last group absorbs the remainder

    #never ask a group for more rows than it has (or for a negative number of rows)
    quotas = [max(0, min(want, available)) for want, available in zip(wanted, tweet_counts)]

    #if some groups were too small, top up from groups that still have spare rows
    shortfall = print_count - sum(quotas)
    for i, available in enumerate(tweet_counts):
        if shortfall <= 0:
            break
        top_up = min(available - quotas[i], shortfall)
        quotas[i] += top_up
        shortfall -= top_up

    return quotas


def write_csv_length(tweet_df,print_count,tweet_counts):
    
    """
    Writes .csv of specified length with equal (or near equal) counts for each search term
    
    tweet_df is expected to hold the tweets of each search term as one contiguous
    run of rows, in the same order as tweet_counts. Rows are taken from the start
    of each run (beginning with its first row) and never from a neighbouring run.
    The last search term receives the remainder when print_count does not divide
    evenly; if a search term has too few tweets, the missing rows are taken from
    the other terms where possible. The file is named 'tweets_<print_count>.csv'.
    
    Parameters
    ----------
    tweet_df: data frame
    Data Frame of tweet data
    
    print_count: int
    Total number of tweets to write
    
    tweet_counts: list
    List of number of tweets of each search term
    
    Returns
    -------
    short_df: data frame
    Data Frame with specified amount of tweets (original row labels are kept)
    """
    
    pieces = []
    
    if tweet_counts:
        quotas = _allocate_quotas(print_count, tweet_counts)
        start = 0   #row position where the current search term's tweets begin
        for quota, group_size in zip(quotas, tweet_counts):
            pieces.append(tweet_df.iloc[start:start + quota])
            start += group_size
    
    #pd.concat replaces repeated DataFrame.append calls (removed in pandas 2.0)
    short_df = pd.concat(pieces) if pieces else tweet_df.iloc[0:0]
            
    csv_name = 'tweets_' + str(print_count) + '.csv'
    short_df.to_csv(csv_name, index=False)
    
    return short_df

Step 4: Run Code

In [11]:
#Three searches -- one each for rock, pop, and country music. Each query also
#excludes retweets and the accounts I deemed "spam"*
search_list = [
    '#%23rockmusic -filter:retweets -_ArtistRack -coffeeradiogr -RockTalkFM -ZillichR',
    '#%23popmusic -filter:retweets -_ArtistRack -coffeeradiogr -YTPopMusicChan1 -Get_Heard_Today',
    '#%23countrymusic -filter:retweets -Until_Sana -videos_country -ombui -Rchemutai -ItsMwangiKelvyn',
]

#minimum number of tweets wanted for each search term
num_tweets = 1000

#fetch tweets from Twitter; the result holds one inner list per search term
full_tweet_list = get_tweets(search_list, num_tweets)

running: #%23rockmusic -filter:retweets -_ArtistRack -coffeeradiogr -RockTalkFM -ZillichR
running: #%23popmusic -filter:retweets -_ArtistRack -coffeeradiogr -YTPopMusicChan1 -Get_Heard_Today
running: #%23countrymusic -filter:retweets -Until_Sana -videos_country -ombui -Rchemutai -ItsMwangiKelvyn


In [12]:
#The genre names must be listed in the same order as the searches in search_list,
#because tweet_dict_pop pairs genres[i] with full_tweet_list[i] by position
genres = ['rock','pop','country']                   #define names for search terms in search_list
tweet_dict = tweet_dict_pop(full_tweet_list,genres) #create dictionary with text, user, location, genre per search term
tweet_df = dict_to_df(tweet_dict,genres)            #convert dictionary to a single data frame

In [13]:
#run the same cleaning function over the tweet text, usernames, and locations
for column in ('text', 'user', 'location'):
    tweet_df[column] = tweet_df[column].apply(clean_tweets)

In [14]:
tweet_df['sentiment'] = tweet_df['text'].apply(score_sentiment)  #score each cleaned tweet's text and store the result in a new 'sentiment' column

Step 5: Export Tweet Data

In [15]:
#number of tweets collected for each search term, in search order
tweet_counts = [len(term_tweets) for term_tweets in full_tweet_list]

#create one .csv with 100 tweets, one with 1000 tweets, and one with all tweets;
#short_df ends up holding the frame from the last (all tweets) call
for csv_size in (100, 1000, len(tweet_df)):
    short_df = write_csv_length(tweet_df, csv_size, tweet_counts)

*To perform the searches of #rockmusic, #popmusic, and #countrymusic, I had to make several modifications instead of just searching for the base hashtags. I used #%23 for the "hashtag" symbol, added -filter:retweets to remove retweets and obtain original tweets only, and filtered out users I deemed to be spam. I identified these users in several ways. First, I looked at the .csv file with 100 tweets and removed users that occurred an unusual number of times, especially in a row. These users generally posted the same tweet many times in a row. I found users like this for each search. Second, using my R code, I used the location summaries to find unusual locations with high numbers of tweets. For example, I found three users frequently using #countrymusic from the locations Kenya, Nairobi, and Nairobi,Kenya whose tweets had no relevance to country music.