# Setup

In [1]:
!git clone https://github.com/harikc456/anime-subs-mapping.git

Cloning into 'anime-subs-mapping'...
remote: Enumerating objects: 1183, done.[K
remote: Counting objects: 100% (1183/1183), done.[K
remote: Compressing objects: 100% (1110/1110), done.[K
remote: Total 1183 (delta 165), reused 1091 (delta 73), pack-reused 0[K
Receiving objects: 100% (1183/1183), 9.23 MiB | 18.72 MiB/s, done.
Resolving deltas: 100% (165/165), done.


In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"

In [4]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

# Read data

In [5]:
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql.functions import input_file_name, split, col, size, rank, \
col, levenshtein, length, regexp_replace, trim

In [6]:
labels = [('en_text', StringType()),
          ('jp_text', StringType()),
          ('google_translated', StringType()),
          ('score', DoubleType())]
schema = StructType([StructField(x[0], x[1], True) for x in labels])

df = spark.read.format('csv') \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .option("multiLine","true")\
    .option("header", True) \
    .schema(schema) \
    .load("/content/anime-subs-mapping/subs/*.csv")

In [7]:
# Add filename
df = df.withColumn("path", input_file_name())
df = df.withColumn("path_splitted", split("path","/"))
df = df.withColumn("filename", col("path_splitted")[size("path_splitted")-1])
df = df.drop("path").drop("path_splitted")

In [8]:
df.printSchema()

root
 |-- en_text: string (nullable = true)
 |-- jp_text: string (nullable = true)
 |-- google_translated: string (nullable = true)
 |-- score: double (nullable = true)
 |-- filename: string (nullable = true)



# Cleansing

In [9]:
df.createOrReplaceTempView("subs")
print(df.count())

149167


## Remove/Merge duplicate jp_text to eng_text mappings

In [10]:
# Same as SQL code, but using spark functions
# window = Window.partitionBy(df['jp_text']).orderBy(df['score'].desc())
# de_dup_df = df.select('*', rank().over(window).alias('rank')).filter(col('rank')==1)
df = spark.sql("select *,RANK() OVER (PARTITION BY jp_text ORDER BY score) AS RANK from subs")
df = df.filter(col("RANK")==1).dropDuplicates(['jp_text']).drop("RANK")
print(df.count())

140166


## Remove rows where en_text has less than 3 characters length

In [11]:
df = df.filter(length(col('en_text'))>3)
print(df.count())

139773


## Remove rows with score 0 and number of words greater than 1

In [12]:
df = df.filter((col('score')!=0.0) | ((col('score')==0.0) & (size(split(col('en_text'), ' '))<=1)))
print(df.count())

130349


## Remove words inside {} and <> in jp_text and en_text

In [18]:
print(df.where('en_text like "%{%}%"').count())
print(df.where('jp_text like "%{%}%"').count())
print(df.where('en_text like "%<%>%"').count())
print(df.where('jp_text like "%<%>%"').count())
print(df.select("en_text").where('jp_text == "OKってこと？"').show(truncate=False))
print(df.select("jp_text").where('en_text == "No no!"').show(truncate=False))

0
2852
1138
186
+---------------------------+
|en_text                    |
+---------------------------+
|<i>... it's all right?!</i>|
+---------------------------+

None
+-------------+
|jp_text      |
+-------------+
|{\a6}いやいや|
+-------------+

None


In [26]:
df = df.withColumn('en_text', trim(regexp_replace('en_text', r'\{.*?\}', ''))) \
       .withColumn('jp_text', trim(regexp_replace('jp_text', r'\{.*?\}', ''))) \
       .withColumn('en_text', trim(regexp_replace('en_text', r'<.*?>', ''))) \
       .withColumn('jp_text', trim(regexp_replace('jp_text', r'<.*?>', '')))

In [27]:
print(df.where('en_text like "%{%}%"').count())
print(df.where('jp_text like "%{%}%"').count())
print(df.where('en_text like "%<%>%"').count())
print(df.where('jp_text like "%<%>%"').count())
print(df.select("en_text").where('jp_text == "OKってこと？"').show(truncate=False))
print(df.select("jp_text").where('en_text == "No no!"').show(truncate=False))

0
0
0
75
+--------------------+
|en_text             |
+--------------------+
|... it's all right?!|
+--------------------+

None
+--------+
|jp_text |
+--------+
|いやいや|
+--------+

None


# EDA

In [28]:
df.where("TRIM(LOWER(en_text))!=TRIM(LOWER(google_translated))").show(50, False)

+--------------------------------------------------------------------------------------+--------------------------------------------------------------+-------------------------------------------------------------------------------------------+-------------------+----------------------------------+
|en_text                                                                               |jp_text                                                       |google_translated                                                                          |score              |filename                          |
+--------------------------------------------------------------------------------------+--------------------------------------------------------------+-------------------------------------------------------------------------------------------+-------------------+----------------------------------+
|That'll be 1500 yen.                                                                  |\t1500円になります   

In [29]:
df.withColumn("levenshtein", levenshtein(col("en_text"), col("google_translated"))).select("en_text", "google_translated","levenshtein").filter(col("levenshtein")<5).show(50,False)

+-----------------------------+-------------------------+-----------+
|en_text                      |google_translated        |levenshtein|
+-----------------------------+-------------------------+-----------+
|Sup.                         |Hey                      |4          |
|Wait a minute.               |Wait a minute            |1          |
|Nope.                        |Not                      |3          |
|Meow.                        |Merm                     |3          |
|Is she a monster?            |Is it a monster?         |3          |
|Yoshitake!                   |Yoshitake                |1          |
|Did something happen?        |Did something happen     |1          |
|Excuse me.                   |Excuse me                |1          |
|I was confessed to at school.|I was confessed at school|4          |
|Is it nearby?                |Is it nearby?            |0          |
|Yuuki!                       |Yabe!                    |4          |
|December.          