# Pipeline

## Imports

In [1]:
import os , json , csv , datetime , dateutil.parser , unicodedata , time
from datetime import datetime , date , timedelta 
# classifier
import pandas as pd
from pandas import Timestamp as timestamp
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import classification_report 
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer

# web scraping
import requests
from bs4 import BeautifulSoup
#!pip3 install trafilatura
import trafilatura
from transformers import pipeline


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


## Variables

In [6]:
path = '../'

# descriptions training set
new_training_set = pd.read_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/new_training_set.pkl')

# negative twitter training set
dh = pd.read_pickle(path+'TWITTER_SEARCHES/NEGATIVE/digital_humanities_2021.pkl')
music_company = pd.read_pickle(path+'TWITTER_SEARCHES/NEGATIVE/music_company_2021.pkl')
twitter_neg = pd.concat([dh, music_company])
twitter_neg = twitter_neg.loc[twitter_neg['lang'] == 'en']
twitter_neg['Target'] = '0'
twitter_neg = twitter_neg.sample(n=4379, random_state=56)
twitter_neg = twitter_neg[['tweet', 'Target']].reset_index(drop=True)

#positive twitter training set 
music_collection = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_collection.pkl')
song_dataset = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_song_dataset.pkl')
sound_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS/twitter_sound_archive.pkl')
digital_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_digital_archive.pkl')
music_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_archive.pkl')
digi_music_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_digital_music_archive.pkl')
midi_file = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_midi_file.pkl')
music_data = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_data.pkl')
music_research = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS/twitter_music_research.pkl')
music_dataset = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_dataset.pkl')
twitter_pos = pd.concat([sound_archive, music_collection, digital_archive, music_archive, song_dataset, digi_music_archive, midi_file, music_data, music_research, music_dataset])
twitter_pos = twitter_pos.loc[twitter_pos['lang'] == 'en']
twitter_pos['Target'] = '1'
twitter_pos = twitter_pos[['tweet', 'Target']].reset_index(drop=True)

# create the twitter training set
twitter_set = pd.concat([twitter_pos, twitter_neg])
twitter_set['Target'] = twitter_set['Target'].astype('int')
twitter_set = twitter_set.reset_index(drop=True)

#kw and sites to remove from url and title strings 
discard = ['youtu', '404', 'Not Found', 'bandcamp', 'ebay', 'It needs a human touch', 'Page not found', 'open.spotify.com', 'We\'re sorry...', 'Not Acceptable!', 'Access denied', '412 Error', 'goo.gl', 'instagr.am', 'soundcloud', 'apple.co', 'amzn', 'masterstillmusic', 'Facebook', 'facebook', 'sheetmusiclibrary.website', 'Unsupported browser', 'Last.fm', 'last.fm', 'amazon.com', 'tidal.com', 'tmblr.co', 'blogspot', 'dailymusicroll', 'PortalTaxiMusic', 'apple.news', 'yahoo.com', 'sheetmusicplus.com', 'musicnotes.com', 'musescore.com', 'etsy', 'nts.live', 'twitch.tv', 'YouTube', 'radiosparx.com', 'freemusicarchive.org', 'blastradio', 'opensea', 'mixcloud', 'catalog.works', 'nft', 'NFT', 'allmusic.com', 'foundation.app', 'Robot or human?', 'heardle', 'insession.agency', 'jobvite', 'career']

# twitter prediction set 
#prediction_twitter = pd.read_pickle(path+'TWITTER_SEARCHES/PREDICTIONS/digital_archive_22.pkl')
#prediction_twitter = prediction_twitter.loc[prediction_twitter['lang'] == 'en']


## Functions

In [7]:
def lr_training(t_input, t_feature, target, cv_int, score_type, filename, path):
    """ Create a text classifier based on Logistic regression and TF-IDF. Use cross validation 
    
    Parameters
    ----------
    t_input: list
        dataframe including the training set
    t_feature: list
        df column, text of tweet or description of the resource
    target: list
        df column, [0,1] values
    cv_int: int
        the number of cross validation folding
    score_type: str
        precision or recall
    filename: str
        model file name
    path: str
        parent folder
    """
    # TODO eda to define max_features=1000
      
    #count_vect = CountVectorizer()
    #tfidf_transformer = TfidfTransformer() 
    #x_train = tfidf_transformer.fit_transform(x_count)
    tfidf_transformer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, max_features=1000) 
    x_train = tfidf_transformer.fit_transform(t_input[t_feature])
    y_train = t_input[target].values
    model = LogisticRegressionCV(solver='liblinear', random_state=44, cv=cv_int, scoring=score_type)
    
    # export
    model.fit(x_train, y_train)
    export_model = f'MODELS/{filename}_model.pkl'
    export_vectorizer = f'MODELS/{filename}_vectorizer.pkl'
    pickle.dump(model, open(path+export_model, 'wb'))
    pickle.dump(tfidf_transformer, open(path+export_vectorizer, 'wb'))
    
    # report
    y_pred = cross_val_predict(model, x_train, y_train, cv=cv_int)
    report = classification_report(y_train, y_pred)
    print('report:', report, sep='\n')
    return model
    
    
def lr_predict(path, filename, p_input, p_feature):
    """ Classify text using a pickled model based on Logistic regression and TF-IDF.
    
    Parameters
    ----------
    p_input: list
        dataframe including the prediction set
    p_feature: list
        df column, text of tweet or description of the resource
    filename: str
        model file name
    path: str
        parent folder
    """
    export_model = f'{path}MODELS/{filename}_model.pkl'
    export_vectorizer = f'{path}MODELS/{filename}_vectorizer.pkl'
    model = pickle.load(open(export_model, 'rb'))
    tfidf_transformer = pickle.load(open(export_vectorizer, 'rb'))
  
    #result = loaded_model.score(X_test, Y_test)
    #x_new_count = count_vect.transform(p_input[p_feature])
    x_predict = tfidf_transformer.transform(p_input[p_feature])
    y_predict = model.predict(x_predict)
    scores = model.decision_function(x_predict)
    probability = model.predict_proba(x_predict)
    
    #results = [r for r in y_predict]
    result = p_input.copy()
    result['Prediction'] = y_predict
    result['Score'] = scores
    result['Probability'] = probability[:,1]
    result['Input Length'] = result[p_feature].str.len()
    return result


def create_url(keyword, start_date, end_date, max_results):
        search_url = "https://api.twitter.com/2/tweets/search/all" #Change to the endpoint you want to collect data from
        #change params based on the endpoint you are using
        query_params = {'query': keyword,
                        'start_time': start_date,
                        'end_time': end_date,
                        'max_results': max_results,
                        'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
                        'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities',
                        'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                        'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                        'next_token': {}}
        return (search_url, query_params)
    
def connect_to_endpoint(url, headers, params, next_token = None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()
 
def append_to_csv(json_response, fileName):
    #A counter variable
    counter = 0

    #Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    
    #setup usernames via includes
    username = {user['id']: user['username'] for user in json_response['includes']['users']}
    
    #Loop through each tweet
    for tweet in json_response['data']:

        # 1. Username
        author_id = tweet['author_id']
        user = username[author_id]

        # 2. Time created
        created_at = dateutil.parser.parse(tweet['created_at'])

        # 3. Language
        lang = tweet['lang']

        # 4. Tweet metrics
        retweet_count = tweet['public_metrics']['retweet_count']
        reply_count = tweet['public_metrics']['reply_count']
        like_count = tweet['public_metrics']['like_count']
        quote_count = tweet['public_metrics']['quote_count']

        #5. URLs 
        if ('entities' in tweet) and ('urls' in tweet['entities']):
            for url in tweet['entities']['urls']:
                url = [url['expanded_url'] for url in tweet['entities']['urls'] if 'twitter.com' not in url['expanded_url']]
                url = ', '.join(url)
        else:
            url = " "
        
        #6. Tweet text
        text = tweet['text'] 
        
        # Assemble all data in a list
        res = [user, created_at, lang, like_count, quote_count, reply_count, retweet_count, text, url]

        # Append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1    
    
    # When done, close the CSV file
    csvFile.close()

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter) 
    
def twitter_search(token, input_keywords, start, end, mresults, mcount, path='../'):
    
    # TODO filter tweets in english only OR tweak TF-IDF stopwords (lang detection)
    # TODO clean tweets from @ and emoji
    bearer_token = token
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    input_keywords   
    start_list = start
    end_list =  end
    max_results = mresults
    total_tweets = 0

    # Create file
    file_name = str(end[0]).replace(':','-').replace('/','-')
    csvFile = open(f'{path}TWITTER_SEARCHES/{file_name}.csv', "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow(['user', 'created_at', 'lang', 'like_count', 'quote_count', 'reply_count','retweet_count','tweet', 'url'])
    csvFile.close()

    for i in range(0,len(start_list)):
        # Inputs
        count = 0 # Counting tweets per time period
        max_count = mcount # Max tweets per time period
        flag = True
        next_token = None
        
        while flag:
            # Check if max_count reached
            if count >= max_count:
                break
            print("-------------------")
            print("Token: ", next_token)
            for keyword in input_keywords:
                url = create_url(keyword, start_list[i],end_list[i], max_results)
                json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
                result_count = json_response['meta']['result_count']

                if 'next_token' in json_response['meta']:
                    # Save the token to use for next call
                    next_token = json_response['meta']['next_token']
                    print("Next Token: ", next_token)
                    if result_count is not None and result_count > 0 and next_token is not None:
                        print("Start Date: ", start_list[i])
                        append_to_csv(json_response, f'{path}TWITTER_SEARCHES/{file_name}.csv')
                        count += result_count
                        total_tweets += result_count
                        print("Total # of Tweets added: ", total_tweets)
                        print("-------------------")
                        time.sleep(5)                
                # If no next token exists
                else:
                    if result_count is not None and result_count > 0:
                        print("-------------------")
                        print("Start Date: ", start_list[i])
                        append_to_csv(json_response, f'{path}TWITTER_SEARCHES/{file_name}.csv')
                        count += result_count
                        total_tweets += result_count
                        print("Total # of Tweets added: ", total_tweets)
                        print("-------------------")
                        time.sleep(5)

                    #Since this is the final request, turn flag to false to move to the next time period.
                    flag = False
                    next_token = None
                time.sleep(5)
    print("Total number of results: ", total_tweets)
    
    df = pd.read_csv(f'{path}TWITTER_SEARCHES/{file_name}.csv', keep_default_na=False, dtype={"user": "string", "lang": "string", "tweet": "string", "url": "string"})
    
    # clean the tweet from meentions and hashtags
    df['tweet'].replace( { r"@[A-Za-z0-9_]+" : '' }, inplace= True, regex = True)
    df['tweet'].replace( { r"#[A-Za-z0-9_]+" : '' }, inplace= True, regex = True)
    
    # remove tweets that are not in english
    df = df[df['lang'].isin(['en'])]
    
    return df


def scrape_links(link_list):
    links = pd.DataFrame(columns=['Title', 'Description', 'URL'])
    summarizer = pipeline("summarization")
    
    for link in link_list:
        URL = link
        page = None
        ARTICLE = ''
        try:
            x = requests.head(URL)
            content_type = x.headers["Content-Type"] if "Content-Type" in x.headers else "None"
            if ("text/html" in content_type.lower()):
                page = requests.get(URL)
        except Exception:
            pass
        
        if page:
            soup = BeautifulSoup(page.content, "html.parser")
            title = ' '.join([t.text for t in soup.find('head').find_all('title')]).strip() \
                if soup and soup.find('head') and soup.find('body') is not None \
                else URL
            
            try:
                downloaded = trafilatura.fetch_url(URL)
                ARTICLE = trafilatura.extract(downloaded, include_comments=False, include_tables=True)
            except Exception:
                results = soup.find_all(['h1', 'p'])
                text = [result.text for result in results]
                ARTICLE = ' '.join(text)
            
            if len(ARTICLE) > 200:
                # text summarisation
                max_chunk = 500
                #removing special characters and replacing with end of sentence
                ARTICLE = ARTICLE.replace('.', '.<eos>')
                ARTICLE = ARTICLE.replace('?', '?<eos>')
                ARTICLE = ARTICLE.replace('!', '!<eos>')
                sentences = ARTICLE.split('<eos>')
                current_chunk = 0 
                chunks = []

                # split text to process
                for sentence in sentences:
                    if len(chunks) == current_chunk + 1: 
                        if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                            chunks[current_chunk].extend(sentence.split(' '))
                        else:
                            current_chunk += 1
                            chunks.append(sentence.split(' '))
                    else:
                        chunks.append(sentence.split(' '))

                for chunk_id in range(len(chunks)):
                    chunks[chunk_id] = ' '.join(chunks[chunk_id])
                try:
                    res = summarizer(chunks, max_length=120, min_length=30, do_sample=False)
                    # summary
                    text = ' '.join([summ['summary_text'] for summ in res])
                except Exception:
                    text = ARTICLE
                    continue
            else:
                text = ARTICLE
            print(URL,title,'\n',text)
            new_row = {'Title': title, 'Description': text, 'URL': URL.strip()}
            links = links.append(new_row, ignore_index=True)
    return links

## Training twitter and descriptions classifiers

This is a ONE TIME operation. The models are pickled and loaded later to predict new results

In [4]:
# one time training on twitter
#twitter_training_model = lr_training(twitter_set, 'tweet', 'Target', 10, 'precision', 'twitter', path)

# one time training on resources
#resource_training_model = lr_training(new_training_set, 'Description', 'Target', 10, 'f1','resources',path)

## Query Twitter

Calls Twitter API with the list of keywords and returns the table `prediction_twitter`

In [8]:
token = 'AAAAAAAAAAAAAAAAAAAAAJgsNAEAAAAAQcsgbUnOJJmqmU483%2F8x6n9V1i8%3Df0qaEo9cV1sWP4eyNQ6E9s8BiRjvFTSN9mSqithe8uIXSNP68x'

# a selection of keywords from KEYWORDS/bg_summary.csv
# keywords = ['sheet music','music archive','music collection','music library','black music','sound recording','midi file','early music','sound archive','music information','music history','music research','musical score','song dataset','library music','music oral','score collection','digitized score']
keywords = ['sheet music','music archive','music collection','music library']
input_keywords = [k+" -is:retweet" for k in keywords] 

today = date.today()
week_ago = today - timedelta(days=7)
start = [week_ago.strftime("%Y-%m-%dT%H:%M:%S.000Z")]
end = [today.strftime("%Y-%m-%dT%H:%M:%S.000Z")]

mresults = 50 # for each keyword
mcount = 50 # for each timespan (only one, last week, here)
path='../'

prediction_twitter = twitter_search(token, input_keywords, start, end, mresults, mcount, path)

-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpytnyptfnmp8crqdvug4yd43o3sl9
Start Date:  2022-04-15T00:00:00.000Z
# of Tweets added from this response:  50
Total # of Tweets added:  50
-------------------
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpytnylm2odhy36cglhy9aujv0ndrx
Start Date:  2022-04-15T00:00:00.000Z
# of Tweets added from this response:  50
Total # of Tweets added:  100
-------------------
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpytnylls3idscmqvzus92fl40lv25
Start Date:  2022-04-15T00:00:00.000Z
# of Tweets added from this response:  50
Total # of Tweets added:  150
-------------------
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpytnyljdy63dd64qt5m2eplcb0131
Start Date:  2022-04-15T00:00:00.000Z
# of Tweets added from this response:  50
Total # of Tweets added:  200
-------------------
Total number of results:  200


## Classify tweets

In [9]:
# predictions
twitter_predictions = lr_predict(path, 'twitter', prediction_twitter, 'tweet')

tweet_predict_cv_df = twitter_predictions.copy().drop_duplicates()
tweet_predict_cv_df = tweet_predict_cv_df.loc[tweet_predict_cv_df['Prediction'] == 1]
tweet_predict_cv_df = tweet_predict_cv_df[~tweet_predict_cv_df.url.str.contains('|'.join(discard))]
tweet_predict_cv_df = tweet_predict_cv_df.sort_values(by='Score', ascending=False).reset_index(drop=True)
tweet_predict_cv_df = tweet_predict_cv_df[['tweet', 'Prediction', 'Score', 'Probability', 'Input Length', 'url']]
tweet_predict_cv_df

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  tfidf_transformer = pickle.load(open(export_vectorizer, 'rb'))
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
0,Black Music Archive is so fucking SHADY!!!!!!!...,1,10.983706,0.999983,70,
1,Can you identify this song with no music? Chec...,1,9.511414,0.999926,105,
2,Black Feminity TV () Black Music Archive Calv...,1,9.382190,0.999916,113,
3,"""Christian Music Archive"" Artist of the Day: P...",1,7.917846,0.999636,158,http://www.ccmrewound.com
4,Who says pony music is dead? I've got a fuck...,1,7.191593,0.999248,189,
...,...,...,...,...,...,...
117,Grammy Museum To Feature 'The Power Of Women I...,1,0.156079,0.538941,118,http://dlvr.it/SP0VZf
118,I could have roleplayed tradwife if caught yo...,1,0.151471,0.537795,232,
119,This 2012 project may interest you. It's lon...,1,0.131311,0.532781,270,"http://RadioOccupy.tv, https://web.archive.org..."
120,YouTube music has a bigger library than both ...,1,0.110714,0.527650,145,


## Scrape URLS

In [10]:
# get links from positive tweets results
twitter_link_list = [link for link in tweet_predict_cv_df['url'] if 'twitter' not in link]

# scrape URL list
links_to_add = scrape_links(twitter_link_list)

# remove empty descriptions 
links_to_add = links_to_add[links_to_add.Description != ''].reset_index(drop=True)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


Downloading:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

RuntimeError: At least one of TensorFlow 2.0 or PyTorch should be installed. To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ To install PyTorch, read the instructions at https://pytorch.org/.

## Classify web resources

In [36]:
resources_predictions = lr_predict(path, 'resources', links_to_add, 'Description')

resources_preds_cv_df = resources_predictions.copy()
resources_preds_cv_df = resources_preds_cv_df.loc[resources_preds_cv_df['Prediction'] == 1]
resources_preds_cv_df = resources_preds_cv_df[~resources_preds_cv_df.Title.str.contains('|'.join(discard))]
resources_preds_cv_df = resources_preds_cv_df[~resources_preds_cv_df.URL.str.contains('|'.join(discard))]
resources_preds_cv_df.sort_values(by='Score', ascending=False).reset_index(drop=True)

Unnamed: 0,Title,Description,URL,Prediction,Score,Probability,Input Length
0,Third Eye TV | CCA Glasgow,"New music, sound poetry, archival video, inte...",https://www.cca-glasgow.com/programme/third-ey...,1,3.886536,0.979896,187
1,Journals,The National Jazz Archive journal collection ...,https://tinyurl.com/jazz-journals,1,2.931182,0.949367,210
