In [1]:
# IMPORT LIBRARIES FOR REGEX, PANDAS, NUMPY, SQLITE3, MATPLOTLIB, SEABORN, AND WARNINGS (TO IGNORE VISUALIZATION RESULT WARNINGS)
import re
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# IMPORT LIBRARY FOR FLASK AND SWAGGER
from flask import Flask, jsonify, request
from flasgger import Swagger, LazyString, LazyJSONEncoder
from flasgger import swag_from
from wordcloud import WordCloud

# FLASK APPLICATION AND SWAGGER (FLASGGER) CONFIGURATION
app = Flask(__name__)
app.json_encoder = LazyJSONEncoder

# Template for the generated API documentation. Every value is wrapped in a
# LazyString; the host is read from the incoming request when it is evaluated.
swagger_template = {
    'info': {
        'title': LazyString(lambda: 'API Documentation for Data Processing and Modeling'),
        'version': LazyString(lambda: '1.0.0'),
        'description': LazyString(lambda: 'Dokumentasi API untuk Data Processing dan Modeling'),
    },
    'host': LazyString(lambda: request.host),
}

# Routes under which the JSON spec, the static assets and the UI are exposed.
swagger_config = dict(
    headers=[],
    specs=[
        dict(endpoint='docs', route='/docs.json'),
    ],
    static_url_path="/flasgger_static",
    swagger_ui=True,
    specs_route="/docs/",
)

swagger = Swagger(app, config=swagger_config, template=swagger_template)

# LOAD ABUSIVE.CSV AND NEW_KAMUSALAY.CSV FROM THE WORKING DIRECTORY
# abusive: the whole abusive.csv table as a NumPy array.
df_abusive = pd.read_csv('abusive.csv')
abusive = df_abusive.to_numpy()

# new_kamusalay.csv is read without a header row; its two columns are named
# "alay" and "arti" and then exposed as two parallel lists.
df_alay = pd.read_csv('new_kamusalay.csv', encoding='latin-1', header=None)
df_alay.columns = ["alay", "arti"]
alay, arti = (df_alay[column].values.tolist() for column in ("alay", "arti"))

# DEFINE ENDPOINTS: BASIC GET
@swag_from("C:/Users/ACER/Binar DSC/docs/hello_world.yml", methods=['GET'])
@app.route('/', methods=['GET'])
def hello_world():
    """Return a fixed greeting payload as a JSON response.

    The payload carries ``status_code``, ``description`` and ``data`` keys.
    """
    return jsonify(
        status_code=200,
        description="Menyapa Hello World",
        data="Hello World",
    )

# DEFINE ENDPOINTS: POST FOR TEXT PROCESSING FROM TEXT INPUT
@swag_from("C:/Users/ACER/Binar DSC/docs/text_processing.yml", methods=['POST'])
@app.route('/text-processing', methods=['POST'])
def text_processing():
    """Replace every non-alphanumeric character of the posted text with a space.

    Reads the ``text`` form field of the request.

    Returns:
        A JSON response with ``status_code``, ``description`` and ``data``
        (the processed text). When the ``text`` field is absent, a JSON error
        payload is returned together with HTTP status 400.
    """
    text = request.form.get('text')

    # form.get() yields None for a missing field; re.sub would then raise a
    # TypeError and surface as an opaque 500, so reject the request explicitly.
    if text is None:
        error_response = {
            'status_code': 400,
            'description': "Parameter 'text' wajib diisi",
            'data': None,
        }
        return jsonify(error_response), 400

    json_response = {
        'status_code': 200,
        'description': "Teks yang sudah diproses",
        'data': re.sub(r'[^a-zA-Z0-9]', ' ', text),
    }

    response_data = jsonify(json_response)
    return response_data

# DEFINE ENDPOINTS: POST FOR TEXT PROCESSING FROM FILE

# Splits a tweet on the punctuation/space delimiters. The capturing group makes
# re.split keep the delimiters, so tokens sit at even indices and the tweet can
# be re-assembled with ''.join() after individual tokens were rewritten.
_TOKEN_SPLIT_RE = re.compile(r'([.,!?;/ ])')

# Twitter placeholders, matched as whole words only so that e.g. "RT" inside
# another word is left untouched.
_TWEET_LINGO_RE = re.compile(r'\b(?:USER|user|URL|url|RT)\b')

# Columns persisted to the df_tweet table, in insertion order.
_DB_COLUMNS = ('Tweet', 'total_char', 'total_words', 'cleaned_tweet',
               'total_char2', 'total_words2', 'tweet_type')


def _error_response(description):
    """Build a JSON error payload paired with HTTP status 400."""
    payload = {'status_code': 400, 'description': description, 'data': None}
    return jsonify(payload), 400


def _strip_noise(raw_tweets):
    """Remove URLs, escaped byte/newline sequences, '&amp;', unsupported
    characters and Twitter placeholders from a Series of tweets.

    Returns the cleaned tweets as a list of strings, in the same order.
    """
    tweet = raw_tweets.str.replace(r'http\S+|www\.\S+', '', regex=True)  # remove url
    tweet = tweet.str.replace(r'\\x[0-9a-z]{2}', '', regex=True)  # remove literal \xNN sequences
    tweet = tweet.str.replace(r'\\n', '', regex=True)  # remove literal \n sequences
    tweet = tweet.str.replace(r'&amp;', '', regex=True)  # remove ampersand entity
    tweet = tweet.str.replace(r'[^a-zA-Z0-9.,!?;/ ]', '', regex=True)
    return [_TWEET_LINGO_RE.sub('', text) for text in tweet.tolist()]


def _build_alay_map():
    """Map each alay word to its meaning; the first entry wins on duplicates.

    Non-string entries (e.g. NaN produced by the CSV parser) are skipped.
    """
    alay_map = {}
    for slang, meaning in zip(alay, arti):
        if isinstance(slang, str) and isinstance(meaning, str):
            alay_map.setdefault(slang, meaning)
    return alay_map


def _normalize_alay(text, alay_map):
    """Lower-case *text* and replace each whole token found in *alay_map*.

    Replacement is a single pass over the tokens; substituted meanings are not
    looked up again.
    """
    parts = _TOKEN_SPLIT_RE.split(text.lower())
    parts[::2] = [alay_map.get(token, token) for token in parts[::2]]
    return ''.join(parts)


def _mask_abusive(text, abusive_words):
    """Mask every abusive token of *text* with asterisks.

    Returns a ``(masked_text, abusive_token_count)`` tuple.
    """
    parts = _TOKEN_SPLIT_RE.split(text)
    count = 0
    for position, token in enumerate(parts):
        # Odd positions hold the delimiters captured by the split.
        if position % 2 == 0 and token in abusive_words:
            count += 1
            parts[position] = '*' * len(token)
    return ''.join(parts), count


def _store_tweets(df_tweet):
    """Create the df_tweet table if needed and fill it when it is still empty.

    Nothing is inserted when the table already contains rows. The connection
    is closed even if a statement fails.
    """
    q_create_table = """
    create table if not exists df_tweet (Tweet varchar(255), total_char int, total_words int, cleaned_tweet varchar(255), total_char2 int, total_words2 int, tweet_type varchar(255));
    """
    q_insertion = "insert into df_tweet (Tweet, total_char, total_words, cleaned_tweet, total_char2, total_words2, tweet_type) values (?,?,?,?,?,?,?)"

    conn = sqlite3.connect('database_project.db')
    try:
        conn.execute(q_create_table)
        num_rows = conn.execute("select count(*) from df_tweet").fetchone()[0]
        if num_rows == 0:
            # Series.tolist() yields plain Python scalars, which sqlite3 can
            # bind directly (NumPy integers are not supported as parameters).
            rows = zip(*(df_tweet[column].tolist() for column in _DB_COLUMNS))
            conn.executemany(q_insertion, rows)
        conn.commit()
    finally:
        conn.close()


def _save_current_figure(filename):
    """Write the current matplotlib figure to *filename* and release it."""
    plt.savefig(filename)
    # Figures stay registered in pyplot until closed; without this every
    # request would leave five more figures in memory.
    plt.close()


def _save_visualizations(df_tweet):
    """Render the summary charts of *df_tweet* to JPEG files in the working directory."""
    # PERCENTAGE OF ABUSIVE TWEETS
    plt.figure()
    df_tweet.groupby('tweet_type').tweet_type.count().plot(kind="pie", title='Persentase Abusive Tweets', autopct='%.2f%%', labels=None, ylabel='', legend=True)
    _save_current_figure('pie_abusive.jpeg')

    # NUMBER OF ABUSIVE WORDS PER TWEET
    plt.figure()
    sns.countplot(data=df_tweet, x="count_abusive")
    plt.title('Count of Estimated Number of Abusive Words')
    plt.xlabel('Estimated Number of Abusive Words')
    _save_current_figure('new_countplot.jpeg')

    # NUMBER OF WORDS AFTER CLEANSING
    plt.figure()
    sns.boxplot(data=df_tweet, x="total_words2")
    plt.title('Number of Words Boxplot (after tweet cleansing)')
    plt.xlabel('')
    _save_current_figure('boxplot_total_words.jpeg')

    # WORDCLOUD OF THE CLEANED TWEETS
    text = ' '.join(df_tweet['cleaned_tweet'])
    if text.strip():  # skip when there is no text to draw a cloud from
        plt.figure()
        wordcloud = WordCloud(collocations=False).generate(text)
        plt.imshow(wordcloud)
        plt.axis("off")
        _save_current_figure('wordcloud.jpeg')

    # CHARACTERS VS WORDS AFTER CLEANSING
    plt.figure()
    sns.scatterplot(data=df_tweet, x='total_char2', y='total_words2')
    _save_current_figure('scatter.jpeg')


@swag_from("C:/Users/ACER/Binar DSC/docs/text_processing_file.yml", methods=['POST'])
@app.route('/text-processing-file', methods=['POST'])
def text_processing_file():
    """Clean the tweets of an uploaded CSV file, store and visualize the result.

    Reads the uploaded ``file`` (a CSV with a ``Tweet`` column), drops duplicate
    and incomplete rows, then for every tweet: strips noise, replaces alay words
    with their meanings and masks abusive words. Per-tweet statistics are added
    to the frame, the rows are inserted into the ``df_tweet`` table of
    ``database_project.db`` (only when that table is empty) and summary charts
    are written as JPEG files.

    Returns:
        A JSON response whose ``data`` is the list of cleaned tweets. When the
        upload is missing, cannot be parsed as CSV or lacks a ``Tweet`` column,
        a JSON error payload is returned together with HTTP status 400.
    """
    # USING REQUEST TO GET FILE THAT HAS BEEN POSTED FROM API ENDPOINT
    file = request.files.get('file')
    if file is None:
        return _error_response("File CSV wajib diunggah pada field 'file'")

    # IMPORT FILE OBJECT
    try:
        df_tweet = pd.read_csv(file, encoding='latin-1')
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        return _error_response("File tidak dapat dibaca sebagai CSV")
    if 'Tweet' not in df_tweet.columns:
        return _error_response("File CSV harus memiliki kolom 'Tweet'")

    # Reset the index after dropping rows: the per-tweet results below are
    # positional lists, and a gapped index would misalign them with the rows.
    df_tweet = df_tweet.drop_duplicates().dropna().reset_index(drop=True)
    df_tweet['Tweet'] = df_tweet['Tweet'].astype(str)

    # DATA CLEANSING, ALAY SUBSTITUTION AND ABUSIVE-WORD MASKING
    alay_map = _build_alay_map()
    abusive_words = {word for word in np.ravel(abusive).tolist() if isinstance(word, str)}

    cleaned_tweets = []
    count_abusive = []
    for text in _strip_noise(df_tweet['Tweet']):
        masked, abusive_count = _mask_abusive(_normalize_alay(text, alay_map), abusive_words)
        cleaned_tweets.append(masked)
        count_abusive.append(abusive_count)

    # TOTAL_CHAR / TOTAL_WORDS = CHARACTERS / WORDS IN TWEET BEFORE CLEANSING
    df_tweet['total_char'] = df_tweet['Tweet'].apply(len)
    df_tweet['total_words'] = df_tweet['Tweet'].apply(lambda x: len(x.split()))

    # TWEET AFTER CLEANSING
    df_tweet['cleaned_tweet'] = cleaned_tweets

    # TOTAL_CHAR2 / TOTAL_WORDS2 = CHARACTERS / WORDS IN TWEET AFTER CLEANSING
    df_tweet['total_char2'] = df_tweet['cleaned_tweet'].apply(len)
    df_tweet['total_words2'] = df_tweet['cleaned_tweet'].apply(lambda x: len(x.split()))

    # NUMBER OF ABUSIVE WORDS AND THE RESULTING TWEET CLASSIFICATION
    df_tweet['count_abusive'] = count_abusive
    df_tweet['tweet_type'] = ['Abusive' if count > 0 else 'Not Abusive' for count in count_abusive]

    # PERSIST THE RESULT AND RENDER THE CHARTS
    _store_tweets(df_tweet)
    if not df_tweet.empty:
        _save_visualizations(df_tweet)

    # OUTPUT THE RESULT IN JSON FORMAT
    json_response = {
        'status_code': 200,
        'description': "Teks yang sudah diproses",
        'data': list(df_tweet['cleaned_tweet'])
    }

    response_data = jsonify(json_response)
    return response_data

# Start Flask's built-in server only when this module is executed directly.
if __name__ == "__main__":
    app.run()

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [02/Oct/2023 21:49:57] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [02/Oct/2023 21:50:02] "GET /docs/ HTTP/1.1" 200 -
127.0.0.1 - - [02/Oct/2023 21:50:02] "GET /flasgger_static/swagger-ui.css HTTP/1.1" 304 -
127.0.0.1 - - [02/Oct/2023 21:50:02] "GET /flasgger_static/lib/jquery.min.js HTTP/1.1" 304 -
127.0.0.1 - - [02/Oct/2023 21:50:02] "GET /flasgger_static/swagger-ui-standalone-preset.js HTTP/1.1" 304 -
127.0.0.1 - - [02/Oct/2023 21:50:03] "GET /flasgger_static/swagger-ui-bundle.js HTTP/1.1" 304 -
127.0.0.1 - - [02/Oct/2023 21:50:03] "GET /docs.json HTTP/1.1" 200 -
